美文网首页
seurat单细胞数据处理小技巧

seurat单细胞数据处理小技巧

作者: 青青青山 | 来源:发表于2022-06-29 23:31 被阅读0次

    1 亚群合并

    当有几类亚群同属于某类细胞时,比如CD4+ T细胞和CD8+ T细胞均属于T细胞,想要将他们合并在一起时,可以使用此代码。

    #接seurat标准流程代码
    #命名pbmc为sce,表明其是seurat对象
    sce=pbmc
    
    Idents(sce)#细胞身份,属于哪个亚群
    levels(sce)#获取sce的亚群名
    > [1] "Naive CD4 T"  "CD14+ Mono"   "Memory > CD4 T"
    > [4] "B"            "CD8 T"        "FCGR3A+ > Mono"
    > [7] "NK"           "DC"           "Platelet"
    #可以看到Naive CD4 T,Memory CD4T及CD8 T都属于T细胞
    
    head(sce@meta.data) #可以看到meta.data中多了seurat_clusters列,由0-9构成。
    # 方法1 
    
    #根据levels(sce)可将要合并的亚群写作如下
    new.cluster.ids <- c("T", "Mono", "T", 
                         "B", "T", "Mono",
                         "T", "DC", "Platelet")
    
    names(new.cluster.ids) <- levels(sce)
    #更改前
    p1<-DimPlot(pbmc, reduction = 'umap', 
                label = TRUE, pt.size = 0.5) + NoLegend()
    
    #重命名亚群
    sce <- RenameIdents(sce, new.cluster.ids)
    #更改后
    p2<-DimPlot(sce, reduction = 'umap', 
            label = TRUE, pt.size = 0.5) + NoLegend()
    
    
    #方法2
    根据亚群对应关系,创建一个向量
    cluster2celltype <- c("0"="T",
                      "1"="Mono", 
                      "2"="T", 
                      "3"= "B", 
                      "4"= "T", 
                      "5"= "Mono",
                      "6"= "T", 
                      "7"= "DC", 
                      "8"= "Platelet") 
    #unname去掉对象的名(行名列名)
    sce[['cell_type']] = unname(cluster2celltype[sce@meta.data$seurat_clusters])
    #绘制散点图,reduction:使用哪种降维。如果没有指定,首先搜索 umap,然后是 tsne,然后是 pca;label:是否标记分群
    DimPlot(sce, reduction = 'umap', group.by = 'cell_type',
            label = TRUE, pt.size = 0.5) + NoLegend()#删除图例
    
    p3<-DimPlot(sce, reduction = 'umap', group.by = 'cell_type',
            label = TRUE, pt.size = 0.5) + NoLegend()#删除图例
    p1+p2+p3#此函数需要library(dplyr)
    

    推荐使用第二种方法,这样不更改原始亚群分群,只是在metadata中增加了一列

    2 提取子集

    当我们想把表达感兴趣基因的细胞提取出来单独分析时,可使用此函数。

    p1<-DimPlot(pbmc, reduction = 'umap', group.by ="seurat_clusters",label = TRUE, pt.size = 0.5) + NoLegend()
    p2<-FeaturePlot(pbmc, features = c("CD4"))
    p3<-DotPlot(sce, group.by = 'seurat_clusters',
               features = unique(genes_to_check)) + RotatedAxis()
    
    p1+p2+p3
    

    比如我们想把某几个亚群提取出来

    sce=pbmc
    #R 中,提取子集, 需要知道逻辑值,坐标及名字
    # seurat 取0,2分群的子集
    cd4_sce1 = sce[,sce@meta.data$seurat_clusters %in% c(0,2)]
    #取Naive CD4 T ,  Memory CD4 T 
    cd4_sce2 = sce[, Idents(sce) %in% c( "Naive CD4 T" ,  "Memory CD4 T" )]
    

    取子集后就是新的项目,需要重新标准化,重新用子集走一遍seurat标准流程

    3 如何分群是合理的

    我们知道FindClusters函数中不同的resolution参数会带来不同的结果,而且即使某个亚群内部的细胞也会有一定的异质性,那么到底分群多少是合适的呢?

    #先执行不同resolution 下的分群
    library(Seurat)
    library(clustree)
    sce <- FindClusters(
      object = sce,
      resolution = c(seq(from=.1,to=1.6,by=.2))
    ) 
    #clustree创建一个聚类树图,依赖于clustree包,显示不同分辨率的聚类之间的关系。prefix指示包含聚类信息的列的字符串
    colnames(sce@meta.data)
    pz<-clustree(sce@meta.data, prefix = "RNA_snn_res.")
    
    p2=DimPlot(sce, reduction = 'umap', group.by = 'RNA_snn_res.0.5',
               label = TRUE, pt.size = 0.5) + NoLegend()
    p3=DimPlot(sce, reduction = 'umap',group.by = 'RNA_snn_res.1.5',
               label = TRUE, pt.size = 0.5) + NoLegend()
    library(patchwork)
    p1+p2
    

    根据clustree可确定合适的分群数,若resolution设置过大,会导致分群过度,把原来分群的中应有的异质性也提炼出来单独作为一群。无论如何分群,都应该结合FindMarkers函数,确认亚群标志基因,结合生物学背景来解释分群结果。

    4 细胞亚群等比例抽样

    当数据特别大时,可以采用等比例抽样的方式,降低运算时间。

    sce=pbmc
    
    p1<-DoHeatmap(sce , 
              features = features, 
              size = 3
              )
    table(Idents(sce))
    #使用subset函数,每个亚群抽取15个细胞,做热图
    p2<-DoHeatmap(subset(sce, downsample = 15), 
              features = features, size = 3)+
      labs(title = paste("downsample = 15"))
    p1+p2
    
    # 每个细胞亚群抽10 
    allCells=names(Idents(sce))
    allType = levels(Idents(sce))
    
    choose_Cells = unlist(lapply(allType, function(x){
      cgCells = allCells[Idents(sce)== x ]
      cg=sample(cgCells,10) #sample 从指定的元素中获取指定大小的样本。
      cg
    }))
    #将抽取出的细胞组成一个新的sce对象
    cg_sce = sce[, allCells %in% choose_Cells]
    > An object of class Seurat 
    > 13714 features across 90 samples within 1 > assay 
    > Active assay: RNA (13714 features, 2000 > > variable features)
    >  2 dimensional reductions calculated: pca, umap
    
    as.data.frame(table(Idents(cg_sce)))
    

    5 如何查看基因的原始表达量

    # 如何看原始表达量 slot:要使用的数据槽,从“raw.data”、“data”或“scale.data”中选择;size 颜色条上方文字大小
    #不加slot默认是从之前2000个FindVariableFeatures基因作图
    DoHeatmap(subset(sce ), 
              features = features, 
              size = 3, slot='data'
    )
    

    6 多个亚群的多个标记基因可视化

    #首先需要将各个亚群的标记基因找出来
    sce.markers <- FindAllMarkers(object = sce, 
                   only.pos = TRUE,min.pct = 0.25, thresh.use = 0.25)
    
    #此时可以创建一个HTML小部件来显示矩形数据
    DT::datatable(sce.markers) 
    

    可以以交互形式的HTML格式查看各亚群的标记基因

    #挑选各个亚群差异基因的前5名
    top5 <- sce.markers %>% group_by(cluster) %>% top_n(5,avg_log2FC)
    
    head(top5)
    #检测并去除重复值 !表示“与、或、非”中的“非”的意思
    top5=top5[!duplicated(top5$gene),] 
    select_genes_all=split(top5$gene,top5$cluster)#将表拆分成列表
    select_genes_all
    DotPlot(object = sce, features=select_genes_all, 
            assay = "RNA") + theme(axis.text.x = element_text(angle= 45,  vjust = 0.5, hjust=0.5))
    

    不同亚群多个差异基因的可视化

    7 对任意细胞亚群做差异分析

    #若没有运行过FindAllMarkers,运行FindAllMarkers函数,查找各亚群标记基因
    sce.markers <- FindAllMarkers(object = sce, 
                   only.pos = TRUE,min.pct = 0.25, thresh.use = 0.25)
    
    #table使用交叉分类的因素建立一个列联表,计数在每个组合的因素水平。
    table(sce.markers$cluster)
    
    Naive CD4 T   CD14+ Mono Memory CD4 T            B        CD8 T 
             164          391          178          148          144 
    FCGR3A+ Mono           NK           DC     Platelet 
             608          369          633          242 
    

    挑选亚群和对应基因

    # 挑选细胞,若挑选多个亚群细胞,可采用正则表达式选择,比如挑选 Mono和T细胞:grepl(("Mono|T") , Idents(sce )),这里只挑选Mono类亚群
    kp=grepl(("Mono") , Idents(sce ))   
    table(kp)
    sce=sce[,kp]
    sce
    table( Idents(sce ))
    
    # 挑选对应的标记基因
    kp=grepl('Mono',sce.markers$cluster)#返回匹配与否的逻辑值
    table(kp)
    cg_sce.markers = sce.markers [ kp ,]
    #这里认为vg_log2FC>2的才是标记基因
    cg_sce.markers=cg_sce.markers[cg_sce.markers$avg_log2FC>2,]
    

    查找两群的差异基因

    markers_df <- FindMarkers(object = sce, 
                     ident.1 = 'FCGR3A+ Mono',#ident 聚类名
                    ident.2 = 'CD14+ Mono',
                     #logfc.threshold = 0,
                      min.pct = 0.25)
                      
    head(markers_df)
    

    筛选差异基因

    #选取avg_log2FC绝对值大于1基因
    cg_markers_df=markers_df[abs(markers_df$avg_log2FC) >1,]
    
    dim(cg_markers_df)
    
    DoHeatmap(subset(sce, downsample = 50),
              slot = 'counts',
              unique(rownames(cg_markers_df)),size=3) 
    #取两个基因集的交集(基因在某个群中即高表达,又有差异)
    intersect(rownames(cg_markers_df) ,
              cg_sce.markers$gene)
    
    

    8 人为划分亚群挑选差异基因

    当我们对表达某类基因的细胞感兴趣时,可以把表达这类基因的的细胞挑选出来,单独划分为一个群。

    highCells= colnames(subset(x = sce, subset = FCGR3A > 1,slot = 'counts')) 
    
    #a %in% table,a值是否包含于table中,为真输出TURE,否者输出FALSE
    highORlow=ifelse(colnames(sce) %in% highCells,'high','low')
    
    table(highORlow)
    table(Idents(sce))
    table(Idents(sce),highORlow)
    
    #将highORlow添加到meta.data中
    sce@meta.data$highORlow<-highORlow
    
    #查找划分出的亚群与其他亚群的差异基因
    markers <- FindMarkers(sce, ident.1 = "high", group.by = 'highORlow' )
    head(markers)
    
    cg_markers=markers[abs(markers$avg_log2FC) >1,]
    
    dim(cg_markers)
    
    DoHeatmap(subset(sce, downsample = 15),
              rownames(cg_markers) ,size=3 ) 
    
    

    9 查看任意文献的单细胞亚群标记基因

    可以直接使用DotPlot函数以文献中标记基因作图

    #若想查看下述marker_genes在T细胞中的表达,首先需要将T细胞相关亚群提取出来
    
    sce=pbmc
    table(Idents(sce))
    #提取单细胞相关亚群
    t_sce = sce[, Idents(sce) %in% c("Naive CD4 T" ,  "Memory CD4 T" , "CD8 T","NK")]  
    
    # 然后进行标准的降维聚类分群,代码不要变动
    sce=t_sce
    sce <- NormalizeData(sce, normalization.method = "LogNormalize",
                         scale.factor = 1e4) 
    sce <- FindVariableFeatures(sce, selection.method = 'vst',
                                nfeatures = 2000)
    sce <- ScaleData(sce, vars.to.regress = "percent.mt")
    sce <- RunPCA(sce, features = VariableFeatures(object = sce)) 
    sce <- FindNeighbors(sce, dims = 1:10)
    sce <- FindClusters(sce, resolution = 1 )
    head(Idents(sce), 5)
    table(sce$seurat_clusters) 
    sce <- RunUMAP(sce, dims = 1:10)
    DimPlot(sce, reduction = 'umap')
    

    查看文献中提到的基因

    marker_genes=c("LEF1","TCF7","SELL","IL7R","CD40LG","ANXA1","FOS","JUN","FOXP3","SAT1","IL2RA","CTLA4","PDCD1","CXCL13","CD200","TNFRSF18","CCR7","NELL2","CD55","KLF2","TOB1","ZNF683","CCL5","GZMK","EOMES","ITM2C","CX3CR1","GNLY","GZMH","GZMB","LAG3","CCL4L2","FCGR3A","FGFBP2","TYROBP","AREG","XCL1","KLRC1","TRDV2","TRGV9","MTRNR2L8","KLRD1","TRDV1","KLRC3","CTSW","CD7","MKI67","STMN1","TUBA1B","HIST1H4C" )
    
    DotPlot(sce, features = marker_genes,
                 assay='RNA'  )  + coord_flip() +
      theme(axis.text.x = element_text(angle = 90))
    

    10 多个单细胞亚群各自标记基因联合进行GO及kegg数据库注释

    #加载需要的包
    library(Seurat)
    library(gplots)
    library(ggplot2)
    library(clusterProfiler)
    library(org.Hs.eg.db)
    
    #加载上述cg_sce.markers
    pro='all'
    load('sce.markers.all_10_celltype.Rdata')
    table(sce.markers$cluster)
    
    #基因ID转换bitr(geneID, fromType, toType, OrgDb, drop = TRUE)
    ids=bitr(sce.markers$gene,'SYMBOL','ENTREZID','org.Hs.eg.db')
    
    #合并数据
    sce.markers=merge(sce.markers,ids,by.x='gene',by.y='SYMBOL')
    
    #拆分id和分群,将ENTREZID拆分成cluster定义的组 
    gcSample=split(sce.markers$ENTREZID, sce.markers$cluster)
    
    #比较亚群功能
    xx <- compareCluster(gcSample, fun="enrichKEGG",organism="hsa", pvalueCutoff=0.05)
    p=dotplot(xx) 
    p+ theme(axis.text.x = element_text(angle = 45,  vjust = 0.5, hjust=0.5))
    p
    

    对相关基因进行富集分析

    #将上升下降的差异基因挑选出来
    deg=FindMarkers(object = sce, ident.1 = 'FCGR3A+ Mono',ident.2 = 'CD14+ Mono', #logfc.threshold = 0,min.pct = 0.25)
    
    gene_up=rownames(deg[deg$avg_log2FC>0,])
    gene_down=rownames(deg[deg$avg_log2FC < 0,])
    
    library(org.Hs.eg.db)
    #把SYMBOL改为ENTREZID,这里会损失一部分无法匹配到的
    gene_up=as.character(na.omit(AnnotationDbi::select(org.Hs.eg.db, keys = gene_up,columns = 'ENTREZID',keytype = 'SYMBOL')[,2]))
    
    gene_down=as.character(na.omit(AnnotationDbi::select(org.Hs.eg.db,keys = gene_down,columns = 'ENTREZID',keytype = 'SYMBOL')[,2]))
    
    library(clusterProfiler)
    
    
    

    这里用到了生信技能树-健明老师的一键分析代码,感谢健明老师的无私分享

    ## KEGG pathway analysis
    ### 做KEGG数据集超几何分布检验分析,重点在结果的可视化及生物学意义的理解。
    run_kegg <- function(gene_up,gene_down,pro='test'){
      library(ggplot2)
      gene_up=unique(gene_up)
      gene_down=unique(gene_down)
      gene_diff=unique(c(gene_up,gene_down))
      ###   over-representation test
      # 下面把3个基因集分开做超几何分布检验
      # 首先是上调基因集。
      kk.up <- enrichKEGG(gene         = gene_up,
                          organism     = 'hsa',
                          #universe     = gene_all,
                          pvalueCutoff = 0.9,
                          qvalueCutoff =0.9)
      head(kk.up)[,1:6]
      kk=kk.up
      dotplot(kk)
      kk=DOSE::setReadable(kk, OrgDb='org.Hs.eg.db',keyType='ENTREZID')
      write.csv(kk@result,paste0(pro,'_kk.up.csv'))
      
      # 首先是下调基因集。
      kk.down <- enrichKEGG(gene         =  gene_down,
                            organism     = 'hsa',
                            #universe     = gene_all,
                            pvalueCutoff = 0.9,
                            qvalueCutoff =0.9)
      head(kk.down)[,1:6]
      kk=kk.down
      dotplot(kk)
      kk=DOSE::setReadable(kk, OrgDb='org.Hs.eg.db',keyType='ENTREZID')
      write.csv(kk@result,paste0(pro,'_kk.down.csv'))
      
      # 最后是上下调合并后的基因集。
      kk.diff <- enrichKEGG(gene         = gene_diff,
                            organism     = 'hsa',
                            pvalueCutoff = 0.05)
      head(kk.diff)[,1:6]
      kk=kk.diff
      dotplot(kk)
      kk=DOSE::setReadable(kk, OrgDb='org.Hs.eg.db',keyType='ENTREZID')
      write.csv(kk@result,paste0(pro,'_kk.diff.csv'))
      
      
      kegg_diff_dt <- as.data.frame(kk.diff)
      kegg_down_dt <- as.data.frame(kk.down)
      kegg_up_dt <- as.data.frame(kk.up)
      down_kegg<-kegg_down_dt[kegg_down_dt$pvalue<0.01,];down_kegg$group=-1
      up_kegg<-kegg_up_dt[kegg_up_dt$pvalue<0.01,];up_kegg$group=1
      #画图设置, 这个图很丑,大家可以自行修改。
      g_kegg=kegg_plot(up_kegg,down_kegg)
      print(g_kegg)
      
      ggsave(g_kegg,filename = paste0(pro,'_kegg_up_down.png') )
      
      if(F){
        ###  GSEA 
        ## GSEA算法跟上面的使用差异基因集做超几何分布检验不一样。
        kk_gse <- gseKEGG(geneList     = geneList,
                          organism     = 'hsa',
                          nPerm        = 1000,
                          minGSSize    = 20,
                          pvalueCutoff = 0.9,
                          verbose      = FALSE)
        head(kk_gse)[,1:6]
        gseaplot(kk_gse, geneSetID = rownames(kk_gse[1,]))
        gseaplot(kk_gse, 'hsa04110',title = 'Cell cycle') 
        kk=DOSE::setReadable(kk_gse, OrgDb='org.Hs.eg.db',keyType='ENTREZID')
        tmp=kk@result
        write.csv(kk@result,paste0(pro,'_kegg.gsea.csv'))
        
        
        # 这里找不到显著下调的通路,可以选择调整阈值,或者其它。
        down_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore < 0,];down_kegg$group=-1
        up_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore > 0,];up_kegg$group=1
        
        g_kegg=kegg_plot(up_kegg,down_kegg)
        print(g_kegg)
        ggsave(g_kegg,filename = paste0(pro,'_kegg_gsea.png'))
        # 
      }
      
    }
    
    ### GO database analysis 
    ### 做GO数据集超几何分布检验分析,重点在结果的可视化及生物学意义的理解。
    run_go <- function(gene_up,gene_down,pro='test'){
      gene_up=unique(gene_up)
      gene_down=unique(gene_down)
      gene_diff=unique(c(gene_up,gene_down))
      g_list=list(gene_up=gene_up,
                  gene_down=gene_down,
                  gene_diff=gene_diff)
      # 因为go数据库非常多基因集,所以运行速度会很慢。
      if(T){
        go_enrich_results <- lapply( g_list , function(gene) {
          lapply( c('BP','MF','CC') , function(ont) {
            cat(paste('Now process ',ont ))
            ego <- enrichGO(gene          = gene,
                            #universe      = gene_all,
                            OrgDb         = org.Hs.eg.db,
                            ont           = ont ,
                            pAdjustMethod = "BH",
                            pvalueCutoff  = 0.99,
                            qvalueCutoff  = 0.99,
                            readable      = TRUE)
            
            print( head(ego) )
            return(ego)
          })
        })
        save(go_enrich_results,file =paste0(pro, '_go_enrich_results.Rdata'))
        
      }
      # 只有第一次需要运行,就保存结果哈,下次需要探索结果,就载入即可。
      load(file=paste0(pro, '_go_enrich_results.Rdata'))
      
      n1= c('gene_up','gene_down','gene_diff')
      n2= c('BP','MF','CC') 
      for (i in 1:3){
        for (j in 1:3){
          fn=paste0(pro, '_dotplot_',n1[i],'_',n2[j],'.png')
          cat(paste0(fn,'\n'))
          png(fn,res=150,width = 1080)
          print( dotplot(go_enrich_results[[i]][[j]] ))
          dev.off()
        }
      }
      
      
    }
    
    kegg_plot <- function(up_kegg,down_kegg){
      dat=rbind(up_kegg,down_kegg)
      colnames(dat)
      dat$pvalue = -log10(dat$pvalue)
      dat$pvalue=dat$pvalue*dat$group 
      
      dat=dat[order(dat$pvalue,decreasing = F),]
      
      g_kegg<- ggplot(dat, aes(x=reorder(Description,order(pvalue, decreasing = F)), y=pvalue, fill=group)) + 
        geom_bar(stat="identity") + 
        scale_fill_gradient(low="blue",high="red",guide = FALSE) + 
        scale_x_discrete(name ="Pathway names") +
        scale_y_continuous(name ="log10P-value") +
        coord_flip() + theme_bw()+theme(plot.title = element_text(hjust = 0.5))+
        ggtitle("Pathway Enrichment") 
    }
    

    运行以上代码后,就可以使用run_kegg,run_go和kegg_plot函数了

    #直接利用脚本中的代码对正、负相关基因进行kegg富集分析
    run_kegg(gene_up,gene_down,pro='test')
    
    # 下面的 run_go 会比较慢,根据需要
    # run_go(gene_up,gene_down,pro='shRNA')
    
    #对正相关基因进行富集分析
    go <- enrichGO(gene_up, OrgDb = "org.Hs.eg.db", ont="all") 
    library(ggplot2)
    library(stringr)
    p1<-barplot(go, split="ONTOLOGY")+ facet_grid(ONTOLOGY~., scale="free")
    go=DOSE::setReadable(go, OrgDb='org.Hs.eg.db',keyType='ENTREZID')
    write.csv( go@result,file = 'gene_up_GO_all_barplot.csv')
    
    #对负相关基因进行富集分析
    go <- enrichGO(gene_down, OrgDb = "org.Hs.eg.db", ont="all") 
    p2<-barplot(go, split="ONTOLOGY")+ facet_grid(ONTOLOGY~., scale="free") 
     
    go=DOSE::setReadable(go, OrgDb='org.Hs.eg.db',keyType='ENTREZID')
    write.csv( go@result,file = 'gene_down_GO_all_barplot.csv')
    
    p1+p2
    

    参考来源
    公众号:生信技能树
    #section 3已更新#「生信技能树」单细胞公开课2021_哔哩哔哩_bilibili

    致谢
    I thank Dr.Jianming Zeng(University of Macau), and all the members of his bioinformatics team, biotrainee, for generously sharing their experience and codes.

    THE END

    相关文章

      网友评论

          本文标题:seurat单细胞数据处理小技巧

          本文链接:https://www.haomeiwen.com/subject/bsvqbrtx.html