加载包,设置路径等:
# Differential expression analysis with limma
source("https://bioconductor.org/biocLite.R")
# biocLite()
#biocLite("sva")
getwd()
setwd("D:/RWork/Rworking")
library(Biobase)
library(GEOquery)
library(limma)
# load series and platform data from GEO
getwd()
setwd("D:/RWork")
dir.create("GEO2R")
setwd("./GEO2R")
下载数据,是表达矩阵:
# load series and platform data from GEO
getwd()
setwd("D:/RWork")
dir.create("GEO2R")
setwd("./GEO2R")
gset <- getGEO("GSE84846", GSEMatrix =TRUE, AnnotGPL=TRUE)
if (length(gset) > 1) idx <- grep("GPL6480", attr(gset, "names")) else idx <- 1
gset <- gset[[idx]]
# make proper column names to match toptable
fvarLabels(gset) <- make.names(fvarLabels(gset))
函数fvarLabels()
分组和样本筛选,剔除标记为“X”的样本,不符合纳入标准:
# group names for all samples
gsms <- paste0("1011000000200X221X102211000002XX21X0200X1100202001",
"0XX21001X21X1100021XX21020220222211X1220212202X00")
sml <- c()
for (i in 1:nchar(gsms)) { sml[i] <- substr(gsms,i,i) }
# eliminate samples marked as "X"
sel <- which(sml != "X")
sml <- sml[sel]
gset <- gset[ ,sel]
筛选前99个样本 | 筛选后85个样本 |
---|
对数转换:
# log2 transform
ex <- exprs(gset)
qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
LogC <- (qx[5] > 100) ||
(qx[6]-qx[1] > 50 && qx[2] > 0) ||
(qx[2] > 0 && qx[2] < 1 && qx[4] > 1 && qx[4] < 2)
if (LogC) { ex[which(ex <= 0)] <- NaN
exprs(gset) <- log2(ex) }
ex是一个矩阵
LogC的逻辑值为F
根据ex的分位数的特征(包括99分位数是否大于100,极差是否大于50等等),判断LogC的值为假,如此判断表达矩阵ex是一个已经过log2转换的矩阵,无需进行对数转换。
下面进行的是差异分析:
# set up the data and proceed with analysis
sml <- paste("Node", sml, sep="") # set group names
fl <- as.factor(sml)
gset$description <- fl
design <- model.matrix(~ description + 0, gset)
colnames(design) <- levels(fl)
fit <- lmFit(gset, design)
cont.matrix <- makeContrasts(G2-G0, G1-G0, G2-G1, levels=design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2, 0.01)
tT <- topTable(fit2, adjust="fdr", sort.by="B", number=10000)
Node2vs0 <- topTable(fit2, adjust="fdr",coef = 1, sort.by="B", number=10000)
Node1vs0 <- topTable(fit2, adjust="fdr",coef = 2, sort.by="B", number=10000)
Node2vs1 <- topTable(fit2, adjust="fdr",coef = 3, sort.by="B", number=10000)
tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","F","Gene.symbol","Gene.title"))
write.table(tT, file=stdout(), row.names=F, sep="\t")
gset$description属性和design变量 | design |
---|
gset$description属性和design变量 | design |
---|
fit | fit2 |
---|
Bayes拟合后fit2 差异表达结果提取
注意比较三个变量列名的不同。。。
Node2vs0 <- topTable(fit2, adjust="BH",coef = 1, number=10000, p.value = 0.05)
Node2vs0 <- Node2vs0[, c(1, 3, 23, 24, 25, 26, 27, 28)]
head(Node2vs0)
Node1vs0 <- topTable(fit2, adjust="fdr",coef = 2, number=10000, p.value = 0.05)
#Node1vs0 <- Node1vs0[, c(1, 3, 23, 24, 25, 26, 27, 28)]
head(Node1vs0)
Node1vs0
Node2vs1 <- topTable(fit2, adjust="fdr",coef = 3, number=10000, p.value = 0.05)
#Node2vs1 <- Node2vs1[, c(1, 3, 23, 24, 25, 26, 27, 28)]
head(Node2vs1)
write.table(Node1vs0, file = "Node1vs0.txt", row.names=F, sep="\t")
write.table(Node2vs0, file = "Node2vs0.txt", row.names=F, sep="\t")
write.table(Node2vs1, file = "Node2vs1.txt", row.names=F, sep="\t")
tT <- subset(tT, select=c("ID","adj.P.Val","P.Value","F","Gene.symbol","Gene.title"))
write.table(tT, file=stdout(), row.names=F, sep="\t")
部分结果
出来的结果表明,I期和0期之间,差异基因不明显,表明林场分期对其混杂影响很大,需要优化分组。
网友评论
"0XX21001X21X1100021XX21020220222211X1220212202X00")”这个分组字符串是怎么得到的呢?谢谢!