作为实验室生信分析的二把刀,狗哥早上给我发了一个数据说让我帮他看看
我打开一看是安捷伦的芯片数据,ono傻眼了
开始搜教程,建明老师yyds
https://mp.weixin.qq.com/s/k1Br-W-ukKFMg9PBMqUt9A
贴上我的代码保存一下,以后发现错误了及时改正
数据是GSE111974,进去芯片是GPL17077
数据我下载的GSE里的,R里用包下载我一直报错
芯片注释的匹配用excel整的,毕竟我是个R都搞不明白的人
#-------------读入原始文件路径--------------------------------
files = dir(path = ".",pattern = "txt",full.names = T,recursive = T)
x <- read.maimages(files, source="agilent", green.only=TRUE,other.columns = "gIsWellAboveBG")
#-------背景噪音矫正----------
data.bg <- backgroundCorrect(x,method = "normexp")#输出Large ElistRaw结果
#-------标准化-----------
#在这里需要注意,data.norm为Large Elist结果,此时表达量为log2形式保存。
data.norm <- normalizeBetweenArrays(data.bg,method = "quantile")
class(data.norm)
#------------基因过滤----------------
#https://mp.weixin.qq.com/s/k1Br-W-ukKFMg9PBMqUt9A
#去除对照探针
Control <- data.norm$genes$ControlType==1L;table(Control)
#去除匹配不到genesymbol的探针
NoSymbol <- is.na(data.norm$genes$ProbeName);table(NoSymbol)
#去表达探针(至少在一半样本中高于背景)
IsExpr <- rowSums(data.norm$other$gIsWellAboveBG > 0) >= 24;table(IsExpr)
#测到多次的基因,只保留一个探针
Isdup <- duplicated(data.norm$genes$ProbeName);table(Isdup)
datafilt <- data.norm[!control & !NoSymbol & IsExpr & !Isdup, ]
dim(datafilt)
exp = datafilt@.Data[[1]]
boxplot(exp)
exp[1:2,1:2]
#获得样本名
colnames(exp) = str_extract(colnames(exp),"GSM\\d*")
exp[1:2,1:2]
anno = datafilt$genes
nrow(anno);nrow(exp)
rownames(exp)=rownames(anno)
ids = unique(anno$ProbeName)
exp = exp[!duplicated(anno$ProbeName),]
rownames(exp) = ids
exp[1:4,1:4]
write.csv(exp,"data.csv")
#---差异分析---
group_list = c(rep('control',24),rep('RIF',24))
colData = data.frame(row.names = colnames(exp),group_list = group_list)
design <- model.matrix(~group_list)
fit <- lmFit(exp,design)
fit <- eBayes(fit,trend=TRUE,robust=TRUE)
summary(decideTests(fit))
deg = topTable(fit,coef=2,n=dim(data.norm)[1])
boxplot(exp[rownames(deg)[1],]~group_list)
save(exp,group_list,deg,gse,file = paste0(gse,"deg.Rdata"))
最终结果和文章里的FoldChange不一样,但是显著基因还是OK的
有时间了好好琢磨一下
网友评论