GSE117570

1.下载和读取数据

rm(list = ls())
library(stringr)
library(Seurat)
library(dplyr)
f = dir("GSE117570_RAW/");f
## [1] "GSM3304007_P1_Tumor_processed_data.txt.gz"
## [2] "GSM3304011_P3_Tumor_processed_data.txt.gz"
## [3] "GSM3304013_P4_Tumor_processed_data.txt.gz"
s = str_split(f,"_",simplify = T)[,1];s
## [1] "GSM3304007" "GSM3304011" "GSM3304013"
scelist = list()
for(i in 1:length(f)){
  tmp = read.table(paste0("GSE117570_RAW/",f[[i]]),
                 check.names = F)
  tmp = as.matrix(tmp)

  scelist[[i]] <- CreateSeuratObject(counts = tmp, 
                           project = s[[i]], 
                           min.cells = 3, 
                           min.features = 50)
  scelist[[i]][["percent.mt"]] <- PercentageFeatureSet(scelist[[i]], pattern = "^MT-")
  scelist[[i]] <- subset(scelist[[i]], subset = percent.mt < 5)
}
names(scelist)  = s
lapply(scelist, function(x)dim(x@assays$RNA@counts))
## $GSM3304007
## [1] 6611 1466
## 
## $GSM3304011
## [1] 3211  116
## 
## $GSM3304013
## [1] 7492  281

2.合并到一起，找高变基因

sce.all <- merge(scelist[[1]],
                 y= scelist[ -1 ] ,
                 add.cell.ids =  s) 

# 查看数据
head(sce.all@meta.data, 10)
##                               orig.ident nCount_RNA nFeature_RNA percent.mt
## GSM3304007_AAACCTGGTACAGACG-1 GSM3304007       4338         1224   2.512679
## GSM3304007_AAACGGGGTAGCGCTC-1 GSM3304007      11724         2456   2.021494
## GSM3304007_AAACGGGGTCCTCTTG-1 GSM3304007       3353          726   2.117507
## GSM3304007_AAACGGGTCCAAACAC-1 GSM3304007       2811          986   2.312344
## GSM3304007_AAACGGGTCTTTAGTC-1 GSM3304007       6095         1590   3.002461
## GSM3304007_AAAGATGCATCTACGA-1 GSM3304007       7683         2049   2.824418
## GSM3304007_AAAGATGCATTGAGCT-1 GSM3304007       2428          832   2.800659
## GSM3304007_AAAGATGGTTCAGTAC-1 GSM3304007       2939         1052   3.028241
## GSM3304007_AAAGATGTCTGGTATG-1 GSM3304007       6834         1452   3.906936
## GSM3304007_AAAGCAAAGACTACAA-1 GSM3304007       3429          737   2.391368
table(sce.all@meta.data$orig.ident) 
## 
## GSM3304007 GSM3304011 GSM3304013 
##       1466        116        281
VlnPlot(sce.all, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

FeatureScatter(sce.all, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")

sce.all <- NormalizeData(sce.all)
sce.all <- FindVariableFeatures(sce.all)

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(sce.all), 10)

# plot variable features with and without labels
plot1 <- VariableFeaturePlot(sce.all)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot2

3. PCA

完成PCA线性降维，是后续其他降维方法的基础

sce.all <- ScaleData(sce.all)
sce.all[["RNA"]]@counts[1:4,1:4]
## 4 x 4 sparse Matrix of class "dgCMatrix"
##            GSM3304007_AAACCTGGTACAGACG-1 GSM3304007_AAACGGGGTAGCGCTC-1
## FO538757.2                             .                             .
## AP006222.2                             .                             .
## NOC2L                                  .                             .
## ISG15                                 11                             3
##            GSM3304007_AAACGGGGTCCTCTTG-1 GSM3304007_AAACGGGTCCAAACAC-1
## FO538757.2                             .                             .
## AP006222.2                             .                             .
## NOC2L                                  .                             .
## ISG15                                  .                             4
sce.all <- RunPCA(sce.all, features = VariableFeatures(sce.all))
DimPlot(sce.all, reduction = "pca")

# 应该选多少个主成分进行后续分析
ElbowPlot(sce.all)

sce.all <- JackStraw(sce.all, num.replicate = 100)
sce.all <- ScoreJackStraw(sce.all, dims = 1:20)
JackStrawPlot(sce.all, dims = 1:20)

4.harmony和UMAP

harmony用于整合数据时消除样本间的差异，上一篇是用Seurat自带的CCA方法整合。

library(harmony)
sce.all <- RunHarmony(sce.all, "orig.ident")
names(sce.all@reductions)
## [1] "pca"     "harmony"
sce.all <- RunUMAP(sce.all,  dims = 1:15, 
                     reduction = "harmony")
DimPlot(sce.all,reduction = "umap",label=T )

sce.all <- FindNeighbors(sce.all,reduction = "harmony",dims = 1:15)
sce.all <- FindClusters(sce.all, resolution = 0.2) #分辨率
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 1863
## Number of edges: 64311
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9515
## Number of communities: 9
## Elapsed time: 0 seconds
# 结果聚成几类，用Idents查看
length(levels(Idents(sce.all)))
## [1] 9
table(sce.all@active.ident) 
## 
##   0   1   2   3   4   5   6   7   8 
## 667 382 246 172 127 101  81  57  30
DimPlot(sce.all,reduction = "umap",label=T )

5.SingleR注释

# 注释
library(celldex)
library(SingleR)
#ref <- celldex::HumanPrimaryCellAtlasData()
#因为下载太慢，使用了提前存好的本地数据
ref <- get(load("../single_ref/ref_Hematopoietic.RData"))
library(BiocParallel)
pred.scRNA <- SingleR(test = sce.all@assays$RNA@data, 
                      ref = ref,
                      labels = ref$label.main, 
                      clusters = sce.all@active.ident)
pred.scRNA$pruned.labels
## [1] "Monocytes"       "CD4+ T cells"    "Erythroid cells" "B cells"        
## [5] "Monocytes"       "CD4+ T cells"    "CD8+ T cells"    "HSCs"           
## [9] "HSCs"
plotScoreHeatmap(pred.scRNA, clusters=pred.scRNA@rownames, fontsize.row = 9,show_colnames = T)

new.cluster.ids <- pred.scRNA$pruned.labels
names(new.cluster.ids) <- levels(sce.all)
levels(sce.all)
## [1] "0" "1" "2" "3" "4" "5" "6" "7" "8"
sce.all <- RenameIdents(sce.all,new.cluster.ids)
levels(sce.all)
## [1] "Monocytes"       "CD4+ T cells"    "Erythroid cells" "B cells"        
## [5] "CD8+ T cells"    "HSCs"
UMAPPlot(object = sce.all, pt.size = 0.5, label = TRUE)