-
GEO 数据集的差异分析
语言:R
差异分析包:limma
前提:GSE数据集内有数据,GPL数据有数据
“Series Matrix Files” 文件下载确认一下,是否有数据,有一些是只有title并没有数据的 (ps.空文件会报错)
image.png
GPL数据也下载看一下,注释的信息如果比较乱,可以自己整理一下
library(GEOquery)
library(limma)
GSE_id = "" #数据集
GPL_id="" #探针注释集
gset <- getGEO(GSE_id, GSEMatrix = TRUE, AnnotGPL = FALSE)
if (length(gset) >1 ) idx <- grep(GPL_id, attr(gset, "names")) else idx <- 1
gset <- gset[[idx]]
fvarLabels(gset) <- make.names(fvarLabels(gset))
gsms <- "1111XXXX0000XXXX"
sml <- strsplit(gsms, split="")[[1]]
# filter out excluded samples
sel <- which(sml != "X")
sml <- sml[sel]
gset <- gset[,sel]
#logs transformation
ex <- exprs(gset)
qx <- as.numeric(quantile(ex, c(0, 0.25, 0.5, 0.75, 0.99, 1), na.rm=T))
LogC <- (qx[5] >100) ||
(qx[6] - qx[1] >50 && qx[2] >0)
if (LogC) { ex[which(ex <= 0)] <- NaN
+ exprs(gset) <- log2(ex) }
gs <- factor(sml)
groups <- make.names(c("test","ctl"))
levels(gs) <- groups
gset$group <- gs
design <- model.matrix(~group + 0, gset)
colnames(design) <- levels(gs)
fit <- lmFit(gset, design) #error:如果数据为空,会报错
cts <- paste(groups[1], groups[2], sep="-")
cont.matrix <- makeContrasts(contrasts = cts, levels = design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2, 0.01)
tT <- topTable(fit2, adjust="fdr", sort.by = "B", number=250)
tT <- subset(tT, select=c("ID","adj.P.val", "P.value","t","B","logFC","Gene.symbol","Gene.title"))
write(tT, file="outer.txt", row.name=F, sep="\t")
tT<- subset(tT, Gene.symbol !="NA" & Gene.symbol!="", select=c("adj.P.Val","P.Value","t","B","logFC","Gene.symbol","AveExpr"))
#标记tag
tT$change[tT$logFC>0 & tT$P.value<0.05] ="up"
tT$change[tT$logFC<0 & tT$P.value<0.05] = "down"
tT$change[tT$P.value >0.05 & (tT$logFC>=0 | tT$logFC<=0)] = "NONE"
tT$abs.logFC=abs(tT$logFC)
write(tT, file="outer2.txt", row.names=F,sep="\t")
差异分析完成的outer.txt 可以用来做后续分析
网友评论