1. 概述
恶性肿瘤组织不仅包括肿瘤细胞,还包括肿瘤相关的正常上皮细胞和基质细胞、免疫细胞和血管细胞。而浸润性基质细胞和免疫细胞是肿瘤组织中正常细胞的主要组成部分,不仅在分子研究中干扰肿瘤信号,而且在肿瘤生物学中具有重要的作用。越来越多的证据表明,肿瘤相关的正常细胞的浸润影响了通过基因组方法(如基因表达谱或拷贝数数据)对临床肿瘤样本的分析,而对结果的生物学解释需要相当重视样本的异质性。基于DNA拷贝数的肿瘤纯度估计在预测肿瘤样本纯度方面正迅速得到重视,产生了如ASCAT与ABSOLUTE等的经典算法,然而这些方法仅限于具有可用的拷贝数文件的样本。因此作者提出了一种新的算法,利用癌症样本的转录图谱的独特性质来推断肿瘤细胞的性质以及不同的浸润正常细胞,即Estimation of STromal and Immune cells in MAlignant Tumours using Expression data (ESTIMATE)。文章2013年发表于Nature Communications杂志,题目为Inferring tumour purity and stromal and immune cell admixture from expression data。
2. 算法
作者为了分析来自6个不同平台的表达数据,选择了10412个常见基因,经过5步筛选,得出包含141个基因的Stromal signature和包含141个基因的Immune signature,之后用ssGSEA方法即可计算得到相应的Stromal score与Immune score,两者结合得到Estimate score,通过Estimate score预测肿瘤纯度。文章在算法确定后又通过各种数据进行了一系列的验证,整体相对而言较好理解,也有现成的estimate R包可以直接安装使用。
算法流程3. 实现
R包estimate被放置在R-Forge上,可通过如下代码安装:
library(utils)
rforge <- "http://r-forge.r-project.org"
install.packages("estimate", repos = rforge, dependencies = TRUE)
library(estimate)
或者将tar.gz格式的R包estimate_1.0.13.tar.gz下载到本地,放置在R当前工作目录下,按以下代码安装:
install.packages("estimate_1.0.13.tar.gz", repos = NULL, type = "source")
library(estimate)
将包装在R库之后,即可到库中estimate包下的doc文件夹中寻找到ESTIMATE_Vignette.pdf,初步运行自带的测试数据。
library(estimate)
OvarianCancerExpr <- system.file("extdata", "sample_input.txt", package="estimate")
filterCommonGenes(input.f=OvarianCancerExpr, output.f="OV_10412genes.gct", id="GeneSymbol")
estimateScore(input.ds="OV_10412genes.gct", output.ds="OV_estimate_score.gct", platform="affymetrix")
plotPurity(scores="OV_estimate_score.gct", samples="s519", platform="affymetrix")
如想将.gct格式的结果文件转换为.csv或.txt的文本文件,可用下面代码整理:
estimate_score <- read.table("OV_estimate_score.gct", skip = 2, header = TRUE)
rownames(estimate_score) <- estimate_score[,1]
estimate_score <- estimate_score[,3:ncol(estimate_score)]
estimate_score
得到对象estimate_score后即可进行后续保存。
另外,库中estimate包下的extdata文件夹中包含有具体的141个Stromal signature基因和141个Immune signature基因的文件SI_geneset.gmt,可直接点击查看。
注:
R包estimate编写并不复杂,可通读其源代码,作为一次很好的提高R语言水平的机会,尤其是ESTIMATE算法的实现函数estimateScore()
,包含了ssGSEA算法的代码实现,值得好好学习,列在下面:
function (input.ds, output.ds, platform = c("affymetrix", "agilent",
"illumina"))
{
stopifnot(is.character(input.ds) && length(input.ds) == 1 &&
nzchar(input.ds))
stopifnot(is.character(output.ds) && length(output.ds) ==
1 && nzchar(output.ds))
platform <- match.arg(platform)
ds <- read.delim(input.ds, header = TRUE, sep = "\t", skip = 2,
row.names = 1, blank.lines.skip = TRUE, as.is = TRUE,
na.strings = "")
descs <- ds[, 1]
ds <- ds[-1]
row.names <- row.names(ds)
names <- names(ds)
dataset <- list(ds = ds, row.names = row.names, descs = descs,
names = names)
m <- data.matrix(dataset$ds)
gene.names <- dataset$row.names
sample.names <- dataset$names
Ns <- length(m[1, ])
Ng <- length(m[, 1])
temp <- strsplit(input.ds, split = "/")
s <- length(temp[[1]])
input.file.name <- temp[[1]][s]
temp <- strsplit(input.file.name, split = ".gct")
input.file.prefix <- temp[[1]][1]
for (j in 1:Ns) {
m[, j] <- rank(m[, j], ties.method = "average")
}
m <- 10000 * m/Ng
gs <- as.matrix(SI_geneset[, -1], dimnames = NULL)
N.gs <- 2
gs.names <- row.names(SI_geneset)
score.matrix <- matrix(0, nrow = N.gs, ncol = Ns)
for (gs.i in 1:N.gs) {
gene.set <- gs[gs.i, ]
gene.overlap <- intersect(gene.set, gene.names)
print(paste(gs.i, "gene set:", gs.names[gs.i], " overlap=",
length(gene.overlap)))
if (length(gene.overlap) == 0) {
score.matrix[gs.i, ] <- rep(NA, Ns)
next
}
else {
ES.vector <- vector(length = Ns)
for (S.index in 1:Ns) {
gene.list <- order(m[, S.index], decreasing = TRUE)
gene.set2 <- match(gene.overlap, gene.names)
correl.vector <- m[gene.list, S.index]
TAG <- sign(match(gene.list, gene.set2, nomatch = 0))
no.TAG <- 1 - TAG
N <- length(gene.list)
Nh <- length(gene.set2)
Nm <- N - Nh
correl.vector <- abs(correl.vector)^0.25
sum.correl <- sum(correl.vector[TAG == 1])
P0 <- no.TAG/Nm
F0 <- cumsum(P0)
Pn <- TAG * correl.vector/sum.correl
Fn <- cumsum(Pn)
RES <- Fn - F0
max.ES <- max(RES)
min.ES <- min(RES)
if (max.ES > -min.ES) {
arg.ES <- which.max(RES)
}
else {
arg.ES <- which.min(RES)
}
ES <- sum(RES)
EnrichmentScore <- list(ES = ES, arg.ES = arg.ES,
RES = RES, indicator = TAG)
ES.vector[S.index] <- EnrichmentScore$ES
}
score.matrix[gs.i, ] <- ES.vector
}
}
score.data <- data.frame(score.matrix)
names(score.data) <- sample.names
row.names(score.data) <- gs.names
estimate.score <- apply(score.data, 2, sum)
if (platform != "affymetrix") {
score.data <- rbind(score.data, estimate.score)
rownames(score.data) <- c("StromalScore", "ImmuneScore",
"ESTIMATEScore")
}
else {
convert_row_estimate_score_to_tumor_purity <- function(x) {
stopifnot(is.numeric(x))
cos(0.6049872018 + 0.0001467884 * x)
}
est.new <- NULL
for (i in 1:length(estimate.score)) {
est_i <- convert_row_estimate_score_to_tumor_purity(estimate.score[i])
est.new <- rbind(est.new, est_i)
if (est_i >= 0) {
next
}
else {
message(paste(sample.names[i], ": out of bounds",
sep = ""))
}
}
colnames(est.new) <- c("TumorPurity")
estimate.t1 <- cbind(estimate.score, est.new)
x.bad.tumor.purities <- estimate.t1[, "TumorPurity"] <
0
estimate.t1[x.bad.tumor.purities, "TumorPurity"] <- NA
score.data <- rbind(score.data, t(estimate.t1))
rownames(score.data) <- c("StromalScore", "ImmuneScore",
"ESTIMATEScore", "TumorPurity")
}
outputGCT(score.data, output.ds)
}
网友评论