在读张泽民老师发表的新冠文献的时候看到计算免疫细胞的cytokine score或inflammatory score使用了Seurat包的AddMouduleScore
函数,在计算细胞周期的CellCycleScoring函数的原代码中也使用了这个函数。
此功能可用于计算任何基因列表的监督模块评分,非常实用!!!
1. 查看一下这个函数的用法
library(Seurat)
?AddModuleScore
2. 使用
输入数据:Seurat对象和一个gene list。
- 1 准备矩阵
library(tidyverse)
library(Matrix)
library(cowplot)
pbmc <- readRDS("pbmc.rds")
- 2 准备想要计算的gene list
inflammatory_gene <- readxl::read_xlsx("inflammatory_gene.xlsx")
View(inflammatory_gene)
#转换成list
gene <- as.list(inflammatory_gene)
- 3 计算
pbmc <- AddModuleScore(
object = pbmc,
features = gene,
ctrl = 100, #默认值是100
name = 'CD_Features'
)
计算结果保存在pbmc@meta.data[["CD_Features1"]]
得到的score是在每个细胞中算出来的我们感兴趣的基因的表达均值
。
其在使用时需要先计算基因集中所有基因的平均值,再根据平均值把表达矩阵切割成若干个bin,然后从切割后的每一个bin随机抽取对照基因(基因集外的基因,默认100个)作为背景值。最后所有的目标基因算一个平均值,所有的背景基因算一个平均值,两者相减就是该gene set的score值。因此,在整合不同样本的情况下,即使使用相同基因集为相同细胞打分,也会产生不同的富集评分。从本质上看它和Zscore的算法很类似,Zscore又称Z值,原是一个统计学概念,表示的是个体测量值X以标准差σ为单位,偏离总体均数μ的距离,即:Z score=(X-μ)/σ。牵扯到统计学的概念不免有些难以理解,简单说它就是处理过的平均值。
colnames(pbmc@meta.data)
# [1] "orig.ident" "nCount_RNA" "nFeature_RNA"
# [4] "percent.mt" "RNA_snn_res.0.5" "seurat_clusters"
# [7] "cell_type" "CD_Features1"
colnames(pbmc@meta.data)[8] <- 'Inflammatory_Score'
VlnPlot(pbmc,features = 'Inflammatory_Score')
绘制箱线图
data<- FetchData(pbmc,vars = c("cell_type","Inflammatory_Score"))
p <- ggplot(data,aes(cell_type,Inflammatory_Score))
p+geom_boxplot()+theme_bw()+RotatedAxis()
绘制分数的分布图
library(ggplot2)
mydata<- FetchData(pbmc,vars = c("UMAP_1","UMAP_2","Inflammatory_Score"))
a <- ggplot(mydata,aes(x = UMAP_1,y =UMAP_2,colour = Inflammatory_Score))+geom_point(size = 1)+scale_color_gradientn(values = seq(0,1,0.2),colours = c('blue','cyan','green','yellow','orange','red'))
a+ theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))
打分是一个连续变化的值,我们也可以直接将其转换为一定的分类变量:
#按照均值分
pbmc[["stage"]] <- ifelse(pbmc@meta.data[,"Inflammatory_Score"] > mean(pbmc@meta.data[,"Inflammatory_Score"]),"High","Low")
rownames(pbmc@meta.data)
#按照75%分
pbmc[["stage"]] <-pbmc@meta.data[,"Inflammatory_Score"] <- ifelse(pbmc@meta.data[,"Inflammatory_Score"] > quantile(pbmc@meta.data[,"Inflammatory_Score"],0.75),"High","Low")
#按照具体数值分
pbmc[["stage"]] <-pbmc@meta.data[,"Inflammatory_Score"] <- ifelse(pbmc@meta.data[,"Inflammatory_Score"] > 0.5,"High","Low")
Idents(pbmc) <- 'Inflammatory_Score'
DimPlot(pbmc,reduction = "umap",label = TRUE,pt.size = 1.5)
3. 主要参数
参数 | 意义 |
---|---|
object | Seurat object |
features | A list of vectors of features for expression programs; each entry should be a vector of feature names |
pool | List of features to check expression levels against, defaults to rownames(x = object) |
nbin | Number of bins of aggregate expression levels for all analyzed features |
ctrl | Number of control features selected from the same bin per analyzed feature |
k | Use feature clusters returned from DoKMeans |
assay | Name of assay to use |
name | Name for the expression programs; will append a number to the end for each entry in features (eg. if features has three programs, the results will be stored as name1, name2, name3, respectively) |
seed | Set a random seed. If NULL, seed is not set. |
search | Search for symbol synonyms for features in features that don't match features in object ? Searches the HGNC's gene names database; see UpdateSymbolList for more details |
... | Extra parameters passed to UpdateSymbolList
|
4. 函数代码
AddModuleScore
function (object, features, pool = NULL, nbin = 24, ctrl = 100,
k = FALSE, assay = NULL, name = "Cluster", seed = 1, search = FALSE,
...)
{
if (!is.null(x = seed)) {
set.seed(seed = seed)
}
assay.old <- DefaultAssay(object = object)
assay <- assay %||% assay.old
DefaultAssay(object = object) <- assay
assay.data <- GetAssayData(object = object)
features.old <- features
if (k) {
.NotYetUsed(arg = "k")
features <- list()
for (i in as.numeric(x = names(x = table(object@kmeans.obj[[1]]$cluster)))) {
features[[i]] <- names(x = which(x = object@kmeans.obj[[1]]$cluster ==
i))
}
cluster.length <- length(x = features)
}
else {
if (is.null(x = features)) {
stop("Missing input feature list")
}
features <- lapply(X = features, FUN = function(x) {
missing.features <- setdiff(x = x, y = rownames(x = object))
if (length(x = missing.features) > 0) {
warning("The following features are not present in the object: ",
paste(missing.features, collapse = ", "), ifelse(test = search,
yes = ", attempting to find updated synonyms",
no = ", not searching for symbol synonyms"),
call. = FALSE, immediate. = TRUE)
if (search) {
tryCatch(expr = {
updated.features <- UpdateSymbolList(symbols = missing.features,
...)
names(x = updated.features) <- missing.features
for (miss in names(x = updated.features)) {
index <- which(x == miss)
x[index] <- updated.features[miss]
}
}, error = function(...) {
warning("Could not reach HGNC's gene names database",
call. = FALSE, immediate. = TRUE)
})
missing.features <- setdiff(x = x, y = rownames(x = object))
if (length(x = missing.features) > 0) {
warning("The following features are still not present in the object: ",
paste(missing.features, collapse = ", "),
call. = FALSE, immediate. = TRUE)
}
}
}
return(intersect(x = x, y = rownames(x = object)))
})
cluster.length <- length(x = features)
}
if (!all(LengthCheck(values = features))) {
warning(paste("Could not find enough features in the object from the following feature lists:",
paste(names(x = which(x = !LengthCheck(values = features)))),
"Attempting to match case..."))
features <- lapply(X = features.old, FUN = CaseMatch,
match = rownames(x = object))
}
if (!all(LengthCheck(values = features))) {
stop(paste("The following feature lists do not have enough features present in the object:",
paste(names(x = which(x = !LengthCheck(values = features)))),
"exiting..."))
}
pool <- pool %||% rownames(x = object)
data.avg <- Matrix::rowMeans(x = assay.data[pool, ])
data.avg <- data.avg[order(data.avg)]
data.cut <- cut_number(x = data.avg + rnorm(n = length(data.avg))/1e+30,
n = nbin, labels = FALSE, right = FALSE)
names(x = data.cut) <- names(x = data.avg)
ctrl.use <- vector(mode = "list", length = cluster.length)
for (i in 1:cluster.length) {
features.use <- features[[i]]
for (j in 1:length(x = features.use)) {
ctrl.use[[i]] <- c(ctrl.use[[i]], names(x = sample(x = data.cut[which(x = data.cut ==
data.cut[features.use[j]])], size = ctrl, replace = FALSE)))
}
}
ctrl.use <- lapply(X = ctrl.use, FUN = unique)
ctrl.scores <- matrix(data = numeric(length = 1L), nrow = length(x = ctrl.use),
ncol = ncol(x = object))
for (i in 1:length(ctrl.use)) {
features.use <- ctrl.use[[i]]
ctrl.scores[i, ] <- Matrix::colMeans(x = assay.data[features.use,
])
}
features.scores <- matrix(data = numeric(length = 1L), nrow = cluster.length,
ncol = ncol(x = object))
for (i in 1:cluster.length) {
features.use <- features[[i]]
data.use <- assay.data[features.use, , drop = FALSE]
features.scores[i, ] <- Matrix::colMeans(x = data.use)
}
features.scores.use <- features.scores - ctrl.scores
rownames(x = features.scores.use) <- paste0(name, 1:cluster.length)
features.scores.use <- as.data.frame(x = t(x = features.scores.use))
rownames(x = features.scores.use) <- colnames(x = object)
object[[colnames(x = features.scores.use)]] <- features.scores.use
CheckGC()
DefaultAssay(object = object) <- assay.old
return(object)
}
<bytecode: 0x7fc8df13f008>
<environment: namespace:Seurat>
网友评论