2023-09-20单细胞入门实践(从质控到细胞类型注释)

作者: 麦冬花儿 | 来源:发表于2023-09-20 08:47 被阅读0次

单细胞转录组细胞类型注释分析合集
GSEA也能做细胞类型鉴定？
单细胞分析实录(14): 细胞类型注释的另一种思路 — Cell
单细胞CNV分析-copyKAT
神兵利器——单细胞细胞类群基因marker鉴定新方法：COSG
SingleR如何使用自定义的参考集
单细胞数据的导入与质控 - Seurat
【单细胞】SingleR注释细胞类型
SingleR单细胞类型自动注释与celldex参考数据包
细胞类型重复性验证----MetaNeighbor（1）

这一实践教程针对的是10 X Genomics测序后经Cell Ranger处理的文件，所有分析都从barcodes.tsv.gz, features.tsv.gz, matrix.mtx.gz三个文件开始～

源数据：https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE204796

在GEO下载文件后的linux处理命令（修改文件名）：


for i in {"GSM6194008_D8","GSM6194009_D14","GSM6194010_D21","GSM6194011_D28","GSM6194012_D35"};
do 
mkdir $i;mv $i*.gz $i/;
mv $i/$i""_barcodes.tsv.gz $i/barcodes.tsv.gz;
mv $i/$i""_genes.tsv.gz $i/features.tsv.gz;
mv $i/$i""_matrix.mtx.gz $i/matrix.mtx.gz;
done

下面就是R语言中的操作啦：

#1. install and load packages
install.packages("tidyverse");install.packages("Matrix")
install.packages("RCurl");install.packages("scales");install.packages("purrr")
install.packages("cowplot");install.packages("BiocManager");
install.packages("Seurat");install.packages("metap");install.packages("stringr");
BiocManager::install("SingleCellExperiment");BiocManager::install("AnnotationHub")
BiocManager::install("ensembldb");BiocManager::install("multtest")
BiocManager::install("glmGamPoi")

library(SingleCellExperiment);library(Seurat);library(tidyverse)
library(Matrix);library(scales);library(cowplot);library(RCurl)
library(metap);library(stringr);library(dplyr);library(purrr)

#2. import file
## data source: GSE204796
## read 10X data and generate Seurat object
for (file in c("GSM6194008_D8","GSM6194009_D14")){
        seurat_data <- Read10X(data.dir = paste0("~/Downloads/scRNAseq/", file))
        seurat_obj <- CreateSeuratObject(counts = seurat_data,min.features = 100, 
                        project = file)
        assign(file, seurat_obj)
}
### Create a merged Seurat object
merged_seurat <- merge(x=GSM6194008_D8,GSM6194009_D14,add.cell.id = c("D8","D14"))

#3. Quality Control
## Cell-level filtering (1.Cell counts;2.UMI counts per cell;3.Genes detected per cell;4.Complexity (novelty score);5.Mitochondrial counts ratio)
head(merged_seurat@meta.data)

##                          orig.ident nCount_RNA nFeature_RNA
## D8_AAACCTGAGCTAGGCA-1 GSM6194008_D8      24792         4059
## D8_AAACCTGCAGTATGCT-1 GSM6194008_D8      36193         5260
## D8_AAACCTGGTTCAGACT-1 GSM6194008_D8      42816         5608
## D8_AAACCTGTCTGTTGAG-1 GSM6194008_D8      18663         3305
## D8_AAACGGGAGCGGCTTC-1 GSM6194008_D8      23840         3616
## D8_AAACGGGAGGTTACCT-1 GSM6194008_D8       1417          693


### Compute complexity (number of genes per UMI for each cell)
merged_seurat$log10GenesPerUMI <- log10(merged_seurat$nFeature_RNA)/log10(merged_seurat$nCount_RNA)
### Compute mitochondrial ratio
merged_seurat$mitoRatio <- PercentageFeatureSet(object = merged_seurat, pattern = "^MT-")/100
# Add cell IDs to metadata
metadata <- merged_seurat@meta.data
metadata$cells <- rownames(metadata)
# Create sample column
metadata$sample <- NA
metadata$sample[which(str_detect(metadata$cells, "^D8_"))] <- "D8"
metadata$sample[which(str_detect(metadata$cells, "^D14_"))] <- "D14"
merged_seurat@meta.data <- metadata
# Visualize the correlation between genes detected and number of UMIs and determine whether strong presence of cells with low numbers of genes/UMIs

6a362f94d19d82716118a26018b57290.png

filtered_seurat <- subset(x = merged_seurat,subset= (nCount_RNA >= 500) &
                      (nFeature_RNA >= 250) & (log10GenesPerUMI > 0.80) &
                      (mitoRatio < 0.20))

## Gene-level filtering
counts <- GetAssayData(object = filtered_seurat, slot = "counts")
# Only keeping those genes expressed in more than 10 cells
keep_genes <- Matrix::rowSums(counts > 0) >= 10
filtered_counts <- counts[keep_genes, ]
# Reassign to filtered Seurat object
filtered_seurat <- CreateSeuratObject(filtered_counts, meta.data = filtered_seurat@meta.data)

#4. normalize
## remove cell cycle effect 
seurat_phase <- NormalizeData(filtered_seurat)
# Load cell cycle markers (https://github.com/satijalab/seurat/blob/master/data/)
load("xxx/seurat-master/data/cc.genes.updated.2019.rda")
# Score cells for cell cycle
seurat_phase <- CellCycleScoring(seurat_phase, 
                    g2m.features=cc.genes.updated.2019$g2m.genes, 
                    s.features = cc.genes.updated.2019$s.genes)
# Identify the most variable genes
seurat_phase <- FindVariableFeatures(seurat_phase, 
                     selection.method = "vst",
                     nfeatures = 2000, 
                     verbose = FALSE)
# Scale the counts
seurat_phase <- ScaleData(seurat_phase)
# Perform PCA
seurat_phase <- RunPCA(seurat_phase)
# Plot the PCA colored by cell cycle phase
DimPlot(seurat_phase,reduction = "pca",group.by= "Phase",split.by = "Phase")

62fed5b45d5d4137717fe86ac6288986.png


### normalization using SCTransform
split_seurat <- SplitObject(seurat_phase, split.by = "sample")
split_seurat <- split_seurat[c("D8","D14")]
options(future.globals.maxSize = 4000 * 1024^2)
for (i in 1:length(split_seurat)) {
    split_seurat[[i]] <- SCTransform(split_seurat[[i]], vars.to.regress = c("mitoRatio"),conserve.memory = TRUE)
    }

#5. Integration
# Select the most variable features to use for integration
integ_features <- SelectIntegrationFeatures(object.list = split_seurat, 
                                            nfeatures = 3000) 
# Prepare the SCT list object for integration
split_seurat <- PrepSCTIntegration(object.list = split_seurat, 
                                   anchor.features = integ_features)
# Find best buddies - can take a while to run
integ_anchors <- FindIntegrationAnchors(object.list = split_seurat, 
                                        normalization.method = "SCT", 
                                        anchor.features = integ_features)
# Integrate across conditions
seurat_integrated <- IntegrateData(anchorset = integ_anchors, 
                                   normalization.method = "SCT")
# Run PCA
seurat_integrated <- RunPCA(object = seurat_integrated)
# Run UMAP
seurat_integrated <- RunUMAP(seurat_integrated, dims = 1:40,reduction = "pca")
# Plot UMAP                  
DimPlot(seurat_integrated)

d79523790fc1223e3f26f60cf0bf8e04.png


#5. Clustering
## PCA (feature selection)
##filtered_seurat <- RunPCA(filtered_seurat,verbose = FALSE)
ElbowPlot(object = seurat_integrated, ndims = 40)
# ref: https://hbctraining.github.io/scRNA-seq_online/lessons/elbow_plot_metric.html

5993f63ba6f93bf04862b3a4f3b21f12.png


seurat_integrated <- FindNeighbors(object = seurat_integrated, dims = 1:20, verbose = FALSE)
seurat_integrated <- FindClusters(object = seurat_integrated,resolution = c(0.4, 0.6, 0.8, 1.0, 1.4),verbose = FALSE)

# Assign identity of clusters
seurat_integrated <- RunUMAP(object = seurat_integrated,reduction = "pca",dims = 1:20,verbose = FALSE)

DimPlot(object = seurat_integrated,reduction = "umap",label = TRUE)

Idents(object = seurat_integrated) <- "integrated_snn_res.0.8"
# Plot the UMAP
DimPlot(seurat_integrated,reduction = "umap",label = TRUE,label.size = 6)
# UMAP of cells in each cluster by sample
DimPlot(seurat_integrated, label = TRUE, split.by = "sample")+NoLegend()
# Explore whether clusters segregate by cell cycle phase
DimPlot(seurat_integrated,label = TRUE,split.by = "Phase")+NoLegend()

c964f2b79c996b67fcf1c4592c666518.png

4158741295a6fce79e988cbcd769d72d.png

da875f45dd7e3215365fee04a8ca9e14.png

#6. Marker identification (cluster annotation)
#ref:https://hbctraining.github.io/scRNA-seq_online/lessons/09_merged_SC_marker_identification.html
# Select the RNA counts slot to be the default assay
DefaultAssay(seurat_integrated) <- "RNA"
# Normalize RNA data for visualization purposes
seurat_integrated <- NormalizeData(seurat_integrated, verbose = FALSE)
# Find markers for every cluster compared to all remaining cells, report only the positive ones
markers <- FindAllMarkers(object = seurat_integrated, 
                          only.pos = TRUE,min.diff.pct = 0.3,
                          logfc.threshold = 0.25)            
DefaultAssay(seurat_integrated) <- "RNA"

cluster1_conserved_markers <- FindConservedMarkers(seurat_integrated,
                              ident.1 = 1,
                             grouping.var = "sample",only.pos = TRUE,
                  logfc.threshold = 0.25)
# Extract top 10 markers per cluster
get_conserved <- function(cluster){
  FindConservedMarkers(seurat_integrated,ident.1 = cluster,
        grouping.var = "sample",only.pos = TRUE) %>% cbind(cluster_id = cluster, .)
}
conserved_markers <- map_dfr(c(2,14,15), get_conserved)
top10 <- conserved_markers %>% cbind %>% 
  mutate(avg_fc = (D8_avg_log2FC + D14_avg_log2FC) /2) %>% 
  group_by(cluster_id) %>% 
  top_n(n = 10, wt = avg_fc)
# Plot interesting marker gene expression for cluster 20
FeaturePlot(seurat_integrated, 
            reduction = "umap", 
            features=c("AQP4","GFAP","MBP","PLP1","VCAN","BCAN","FLT1","CLDN5"), 
            order = TRUE,
            min.cutoff = 'q10', 
            label = TRUE)
# Vln plot - cluster 20
VlnPlot(object = seurat_integrated, 
        features = c("FGF17","FGF8","TPSAB1", "TPSB2", "FCER1A", "GATA1", "GATA2"))

c9ada65e718e219cc8725f8bbd10055f.png

b7e773f83f7bf79c5d37edb24558df6f.png


# define clusters
seurat_integrated <- RenameIdents(object = seurat_integrated, 
                               "0" = "Naive or memory CD4+ T cells",
                               "1" = "CD14+ monocytes",
                               "2" = "Naive or memory CD4+ T cells",
                               "3" = "CD14+ monocytes",
                               "4" = "CD4+ T cells",
                               "5" = "CD8+ T cells",
                               "6" = "B cells",
                               "7" = "Stressed cells / Activated T cells",
                               "8" = "NK cells",
                               "9" = "FCGR3A+ monocytes",
                               "10" = "CD4+ T cells",
                               "11" = "B cells",
                               "12" = "NK cells",
                               "13" = "CD8+ T cells",
                               "14" = "CD14+ monocytes",
                               "15" = "Conventional dendritic cells",
             "16" = "Megakaryocytes",
             "17" = "B cells", 
             "18" = "CD4+ T cells")
# Plot the UMAP
DimPlot(object = seurat_integrated, 
        reduction = "umap", 
        label = TRUE,
        label.size = 3,
        repel = TRUE)

3631a6f07dd2b5f15cd3090fde21ff27.png
Detailed explanation (video):****视频讲解

link:https://www.bilibili.com/video/BV1gB4y177n9?spm_id_from=333.999.0.0

References:

1. https://github.com/hbctraining/scRNA-seq 2. cell taxonomy：https://ngdc.cncb.ac.cn/celltaxonomy/

单细胞转录组细胞类型注释分析合集
SingleR 使用SingleR包进行单细胞类型注释分析 Celaref 使用Celaref包进行单细胞类型注释...
GSEA也能做细胞类型鉴定？
单细胞数据挖掘||Garnett :基于监督学习的细胞注释算法 10X 单细胞细胞类型手动定义||Identify...
单细胞分析实录(14): 细胞类型注释的另一种思路 — Cell
之前写过一篇细胞类型注释的推文：单细胞分析实录(7): 差异表达分析/细胞类型注释[https://www.jia...
单细胞CNV分析-copyKAT
对于肿瘤的单细胞样本，在做完细胞质控和细胞类型鉴定之后，可以进入CNV分析。 CNV(Copy number va...
神兵利器——单细胞细胞类群基因marker鉴定新方法：COSG
在单细胞数据分析当中，当我们聚类分群完成之后，紧接着就是细胞类型注释，细胞类型的注释就离不开基因marker ，...
SingleR如何使用自定义的参考集
在我之前的帖子单细胞分析实录(7): 差异表达分析/细胞类型注释里面，我已经介绍了如何使用SingleR给单细胞数...
单细胞数据的导入与质控 - Seurat
###################################### 题目：单细胞数据的导入与质控 - S...
【单细胞】SingleR注释细胞类型
SingleR 教程网址（https://www.bioconductor.org/packages/releas...
SingleR单细胞类型自动注释与celldex参考数据包
SingleR包可基于参考数据集，实现对单细胞数据细胞类型的自动注释。celldex包提供若干常用的人/老鼠的注释...
细胞类型重复性验证----MetaNeighbor（1）
单细胞转录组的出现加快了细胞类型多样性的研究，以及一些稀有细胞类型的发现。但是，细胞类型的注释则因为不同数据集以及...