美文网首页scRNA单细胞测序单细胞实践
单细胞多样本整合之harmony

单细胞多样本整合之harmony

作者: 小洁忘了怎么分身 | 来源:发表于2022-07-11 23:26 被阅读0次

    GSE117570

    1.下载和读取数据

    rm(list = ls())
    library(stringr)
    library(Seurat)
    library(dplyr)
    f = dir("GSE117570_RAW/");f
    ## [1] "GSM3304007_P1_Tumor_processed_data.txt.gz"
    ## [2] "GSM3304011_P3_Tumor_processed_data.txt.gz"
    ## [3] "GSM3304013_P4_Tumor_processed_data.txt.gz"
    s = str_split(f,"_",simplify = T)[,1];s
    ## [1] "GSM3304007" "GSM3304011" "GSM3304013"
    scelist = list()
    for(i in 1:length(f)){
      tmp = read.table(paste0("GSE117570_RAW/",f[[i]]),
                     check.names = F)
      tmp = as.matrix(tmp)
    
      scelist[[i]] <- CreateSeuratObject(counts = tmp, 
                               project = s[[i]], 
                               min.cells = 3, 
                               min.features = 50)
      scelist[[i]][["percent.mt"]] <- PercentageFeatureSet(scelist[[i]], pattern = "^MT-")
      scelist[[i]] <- subset(scelist[[i]], subset = percent.mt < 5)
    }
    names(scelist)  = s
    lapply(scelist, function(x)dim(x@assays$RNA@counts))
    ## $GSM3304007
    ## [1] 6611 1466
    ## 
    ## $GSM3304011
    ## [1] 3211  116
    ## 
    ## $GSM3304013
    ## [1] 7492  281
    

    2.合并到一起,找高变基因

    sce.all <- merge(scelist[[1]],
                     y= scelist[ -1 ] ,
                     add.cell.ids =  s) 
    
    # 查看数据
    head(sce.all@meta.data, 10)
    ##                               orig.ident nCount_RNA nFeature_RNA percent.mt
    ## GSM3304007_AAACCTGGTACAGACG-1 GSM3304007       4338         1224   2.512679
    ## GSM3304007_AAACGGGGTAGCGCTC-1 GSM3304007      11724         2456   2.021494
    ## GSM3304007_AAACGGGGTCCTCTTG-1 GSM3304007       3353          726   2.117507
    ## GSM3304007_AAACGGGTCCAAACAC-1 GSM3304007       2811          986   2.312344
    ## GSM3304007_AAACGGGTCTTTAGTC-1 GSM3304007       6095         1590   3.002461
    ## GSM3304007_AAAGATGCATCTACGA-1 GSM3304007       7683         2049   2.824418
    ## GSM3304007_AAAGATGCATTGAGCT-1 GSM3304007       2428          832   2.800659
    ## GSM3304007_AAAGATGGTTCAGTAC-1 GSM3304007       2939         1052   3.028241
    ## GSM3304007_AAAGATGTCTGGTATG-1 GSM3304007       6834         1452   3.906936
    ## GSM3304007_AAAGCAAAGACTACAA-1 GSM3304007       3429          737   2.391368
    table(sce.all@meta.data$orig.ident) 
    ## 
    ## GSM3304007 GSM3304011 GSM3304013 
    ##       1466        116        281
    VlnPlot(sce.all, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
    
    FeatureScatter(sce.all, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
    
    sce.all <- NormalizeData(sce.all)
    sce.all <- FindVariableFeatures(sce.all)
    
    # Identify the 10 most highly variable genes
    top10 <- head(VariableFeatures(sce.all), 10)
    
    # plot variable features with and without labels
    plot1 <- VariableFeaturePlot(sce.all)
    plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
    plot2
    

    3. PCA

    完成PCA线性降维,是后续其他降维方法的基础

    sce.all <- ScaleData(sce.all)
    sce.all[["RNA"]]@counts[1:4,1:4]
    ## 4 x 4 sparse Matrix of class "dgCMatrix"
    ##            GSM3304007_AAACCTGGTACAGACG-1 GSM3304007_AAACGGGGTAGCGCTC-1
    ## FO538757.2                             .                             .
    ## AP006222.2                             .                             .
    ## NOC2L                                  .                             .
    ## ISG15                                 11                             3
    ##            GSM3304007_AAACGGGGTCCTCTTG-1 GSM3304007_AAACGGGTCCAAACAC-1
    ## FO538757.2                             .                             .
    ## AP006222.2                             .                             .
    ## NOC2L                                  .                             .
    ## ISG15                                  .                             4
    sce.all <- RunPCA(sce.all, features = VariableFeatures(sce.all))
    DimPlot(sce.all, reduction = "pca")
    
    # 应该选多少个主成分进行后续分析
    ElbowPlot(sce.all)
    
    sce.all <- JackStraw(sce.all, num.replicate = 100)
    sce.all <- ScoreJackStraw(sce.all, dims = 1:20)
    JackStrawPlot(sce.all, dims = 1:20)
    

    4.harmony和UMAP

    harmony用于整合数据时消除样本间的差异,上一篇是用Seurat自带的CCA方法整合。

    library(harmony)
    sce.all <- RunHarmony(sce.all, "orig.ident")
    names(sce.all@reductions)
    ## [1] "pca"     "harmony"
    sce.all <- RunUMAP(sce.all,  dims = 1:15, 
                         reduction = "harmony")
    DimPlot(sce.all,reduction = "umap",label=T ) 
    
    sce.all <- FindNeighbors(sce.all,reduction = "harmony",dims = 1:15)
    sce.all <- FindClusters(sce.all, resolution = 0.2) #分辨率
    ## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
    ## 
    ## Number of nodes: 1863
    ## Number of edges: 64311
    ## 
    ## Running Louvain algorithm...
    ## Maximum modularity in 10 random starts: 0.9515
    ## Number of communities: 9
    ## Elapsed time: 0 seconds
    # 结果聚成几类,用Idents查看
    length(levels(Idents(sce.all)))
    ## [1] 9
    table(sce.all@active.ident) 
    ## 
    ##   0   1   2   3   4   5   6   7   8 
    ## 667 382 246 172 127 101  81  57  30
    DimPlot(sce.all,reduction = "umap",label=T ) 
    

    5.SingleR注释

    # 注释
    library(celldex)
    library(SingleR)
    #ref <- celldex::HumanPrimaryCellAtlasData()
    #因为下载太慢,使用了提前存好的本地数据
    ref <- get(load("../single_ref/ref_Hematopoietic.RData"))
    library(BiocParallel)
    pred.scRNA <- SingleR(test = sce.all@assays$RNA@data, 
                          ref = ref,
                          labels = ref$label.main, 
                          clusters = sce.all@active.ident)
    pred.scRNA$pruned.labels
    ## [1] "Monocytes"       "CD4+ T cells"    "Erythroid cells" "B cells"        
    ## [5] "Monocytes"       "CD4+ T cells"    "CD8+ T cells"    "HSCs"           
    ## [9] "HSCs"
    plotScoreHeatmap(pred.scRNA, clusters=pred.scRNA@rownames, fontsize.row = 9,show_colnames = T)
    
    new.cluster.ids <- pred.scRNA$pruned.labels
    names(new.cluster.ids) <- levels(sce.all)
    levels(sce.all)
    ## [1] "0" "1" "2" "3" "4" "5" "6" "7" "8"
    sce.all <- RenameIdents(sce.all,new.cluster.ids)
    levels(sce.all)
    ## [1] "Monocytes"       "CD4+ T cells"    "Erythroid cells" "B cells"        
    ## [5] "CD8+ T cells"    "HSCs"
    UMAPPlot(object = sce.all, pt.size = 0.5, label = TRUE)
    

    相关文章

      网友评论

        本文标题:单细胞多样本整合之harmony

        本文链接:https://www.haomeiwen.com/subject/pwjobrtx.html