美文网首页Single Cell RNA-seq生信
Seurat:Guided Clustering Tutoria

Seurat:Guided Clustering Tutoria

作者: 五谷吃不完了 | 来源:发表于2019-03-30 20:40 被阅读62次
    说明:仅根据官网指南加个人理解,相应图片参考官网(目前官网上最新的Tutorial已经更新成Seurat3.0版本,下面的流程是2.4版本,有些许出入。新版本将会在2019年4月16日通过CRAN下载)

    以下是Seurat(v2.4版本)标准的数据处理过程,包括构建Seurat对象、QC、数据标准化、检测高变异基因等

    1.构建Seurat对象

    library(Seurat)
    library(dplyr)
    #读取10X数据
    pbmc.data <- Read10X(data.dir = "~/Downloads/filtered_gene_bc_matrices/hg19/")
    #构建Seurat对象,这里会有个初筛,保证所有基因在至少3个细胞中表达(0.1%细胞数),保证每个细胞至少能检测到200个基因。
    pbmc <- CreateSeuratObject(raw.data = pbmc.data, min.cells = 3, min.genes = 200, project = "10X_PBMC")
    

    2. QC选择细胞进行后续分析

    #UMI (Unique Molecular Identifier)是一段固定长度的随机小片段,用以区别不同的PCR扩增模板
    #barcode,标记不同的样品或者细胞
    
    #根据细胞内基因数以及线粒体基因所占比例进行过滤细胞
    #提取线粒体DNA并计算其比例
    mito.genes <- grep(pattern = "^MT-", x = rownames(x = pbmc@data), value = TRUE)
    percent.mito <- Matrix::colSums(pbmc@raw.data[mito.genes, ])/Matrix::colSums(pbmc@raw.data)
    
    #使用AddMetaData函数,将上面合并到pbmc对象中,方便后续的QC
    pbmc <- AddMetaData(object = pbmc, metadata = percent.mito, col.name = "percent.mito")
    VlnPlot(object = pbmc, features.plot = c("nGene", "nUMI", "percent.mito"), nCol = 3)
    
    #过滤细胞,细胞数控制在200~2500之间,如果不希望设定阈值,用-Inf表示,线粒体DNA所占比例控制在0.05以内
    pbmc <- FilterCells(object = pbmc, subset.names = c("nGene", "percent.mito"), low.thresholds = c(200, -Inf), high.thresholds = c(2500, 0.05))
    

    3.对数据进行标准化

    pbmc <- NormalizeData(object = pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
    

    4.在单细胞水平上检测变异基因

    #这里参数的设置根据数据的类型、样本的异质性以及标准化的方法,存在差异
    pbmc <- FindVariableGenes(object = pbmc, mean.function = ExpMean, dispersion.function = LogVMR, x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)
    #查看筛选出来的变异基因的数量,一般是2000多
    length(x = pbmc@var.genes)
    

    5.除去不想要的变异来源

    #这里就包括除去技术水平的噪音、批次效应、细胞状态等
    pbmc <- ScaleData(object = pbmc, vars.to.regress = c("nUMI", "percent.mito"))
    

    6.PCA降维处理

    pbmc <- RunPCA(object = pbmc, pc.genes = pbmc@var.genes, do.print = TRUE, pcs.print = 1:5, genes.print = 5)
    
    #通过下面的几种function可以查看定义特定PC的细胞和基因
    PrintPCA(object = pbmc, pcs.print = 1:5, genes.print = 5, use.full = FALSE)
    VizPCA(object = pbmc, pcs.use = 1:2)
    PCAPlot(object = pbmc, dim.1 = 1, dim.2 = 2)
    
    #PCHeatmap这个功能可以很清楚的展示一个数据集的异质性,同时对于确定下游分析中用哪一个PC有着很大的帮助。对于探索相关的基因集合有着很大的帮助
    PCHeatmap(object = pbmc, pc.use = 1, cells.use = 500, do.balanced = TRUE, label.columns = FALSE)#用PC1作图
    PCHeatmap(object = pbmc, pc.use = 1:12, cells.use = 500, do.balanced = TRUE, label.columns = FALSE, use.full = FALSE)#用PC1~12作图
    

    7.确定在统计学上显著的PC

    #这一步耗时很长
    pbmc <- JackStraw(object = pbmc, num.replicate = 100, display.progress = FALSE)
    #这一步可以看一下每一个PC的相关性和显著性,实线在虚线上方
    JackStrawPlot(object = pbmc, PCs = 1:12)
    #确定需要选取的PC,通过曲线的走势
    PCElbowPlot(object = pbmc)
    

    8.对细胞进行聚类

    #resolution这一参数一般设定在0.6~1.2之间(细胞数3K左右),聚类结果放在object@ident中
    pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:10, resolution = 0.6, print.output = 0, save.SNN = TRUE)
    

    9.进行非线性降维(tSNE)

    #tSNE的输入,建议和之前聚类所用的PC一样
    pbmc <- RunTSNE(object = pbmc, dims.use = 1:10, do.fast = TRUE)
    #这一步,你可以添加参数do.label=T,显示每一类群的label
    TSNEPlot(object = pbmc)
    #保存这一对象
    saveRDS(pbmc, file = "~/Projects/datasets/pbmc3k/pbmc_tutorial.rds")
    

    10. 找差异表达的基因(聚类marker)

    #找到cluster1所对应的marker(positive+negative)
    #ident.1 用来指定cluster
    #min.pct 用来指定特定基因需要在%的细胞中被检测到
    cluster1.markers <- FindMarkers(object = pbmc, ident.1 = 1, min.pct = 0.25)
    print(x = head(x = cluster1.markers, n = 5))
    
    #找到区分cluster5和cluster0,3的marker
    cluster5.markers <- FindMarkers(object = pbmc, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25)
    print(x = head(x = cluster5.markers, n = 5))
    
    #找到区分所有cluster的marker
    #只展示positive
    pbmc.markers <- FindAllMarkers(object = pbmc, only.pos = TRUE, min.pct = 0.25, thresh.use = 0.25)
    pbmc.markers %>% group_by(cluster) %>% top_n(2, avg_logFC)
    
    #可视化marker的表达
    VlnPlot(object = pbmc, features.plot = c("MS4A1", "CD79A"))
    #可以选择用raw UMI counts
    VlnPlot(object = pbmc, features.plot = c("NKG7", "PF4"), use.raw = TRUE, y.log = TRUE)
    
    #用PCA或者tSNE图可视化基因的表达
    FeaturePlot(object = pbmc, features.plot = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", "CD8A"), cols.use = c("grey", "blue"), reduction.use = "tsne")
    
    #针对给定的细胞和基因,用DoHeatmap函数画出表达热图
    #这里选定的是每个cluster中表达量前十的基因
    top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(10, avg_logFC)
    #slim.col.label= TRUE表示,只展示cluster的ID而不是名字
    DoHeatmap(object = pbmc, genes.use = top10$gene, slim.col.label = TRUE, remove.key = TRUE)
    
    VlnPlot
    FeaturePlot
    DoHeatmap

    11.更改cluster的名字(确定细胞类型的前提下)

    current.cluster.ids <- c(0, 1, 2, 3, 4, 5, 6, 7)
    new.cluster.ids <- c("CD4 T cells", "CD14+ Monocytes", "B cells", "CD8 T cells", "FCGR3A+ Monocytes", "NK cells", "Dendritic cells", "Megakaryocytes")
    pbmc@ident <- plyr::mapvalues(x = pbmc@ident, from = current.cluster.ids, to = new.cluster.ids)
    TSNEPlot(object = pbmc, do.label = TRUE, pt.size = 0.5)
    

    12.进一步细分细胞类型

    #如果你修改分辨率参数resolution或者改变PC的个数,有可能会使得原来的细胞类群进一步的细分。这样你可以进一步探究在这个细分的cluster中,两者之间的marker有什么区别
    #在此之前,需要对新的聚类群进行重命名,不然会导致之前的结果被覆盖
    
    #分辨率为0.6
    pbmc <- StashIdent(object = pbmc, save.name = "ClusterNames_0.6")
    #分辨率为0.8
    pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:10, resolution = 0.8, print.output = FALSE)
    
    #出图
    plot1 <- TSNEPlot(object = pbmc, do.return = TRUE, no.legend = TRUE, do.label = TRUE)
    plot2 <- TSNEPlot(object = pbmc, do.return = TRUE, group.by = "ClusterNames_0.6", no.legend = TRUE, do.label = TRUE)
    plot_grid(plot1, plot2)
    

    相关文章

      网友评论

        本文标题:Seurat:Guided Clustering Tutoria

        本文链接:https://www.haomeiwen.com/subject/qkosbqtx.html