简介
这是复现一篇结肠癌的文献图例,GSE4107,文献地址https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4399548/pdf/ott-8-745.pdf,其中由于作者细节未公布,可能差异基因个数有稍许差别
数据提取
利用getGEO
函数下载GSE4107
,用exprs
函数获的exp
表达矩阵,pData
函数获取临床信息pd临床数据,@annotation
获得gpl
平台数据。
#数据下载
rm(list = ls())
options(stringsAsFactors = F)
library(GEOquery)
gse = "GSE4107"
eSet <- getGEO(gse,
destdir = '.',
getGPL = F)
#(1)提取表达矩阵exp
exp <- exprs(eSet[[1]])
exp[1:4,1:4]
exp = log2(exp+1)
#(2)提取临床信息
pd <- pData(eSet[[1]])
#(3)调整pd的行名顺序与exp列名完全一致
p = identical(rownames(pd),colnames(exp));p
if(!p) exp = exp[,match(rownames(pd),colnames(exp))]
#(4)提取芯片平台编号
gpl <- eSet[[1]]@annotation
save(gse,pd,exp,gpl,file = "step1output.Rdata")
获取GPL平台信息
首先利用pd
中的title
信息获取临床信息并factor
化,再利用GPL
获取探针信息。
rm(list = ls())
load(file = "step1output.Rdata")
library(stringr)
pd$title
group_list=ifelse(str_detect(pd$title,"healthy control"),"control","treat")
#设置参考水平,对照在前,处理在后
group_list = factor(group_list,
levels = c("control","treat"))
#2.ids-----------------
#方法1 BioconductorR包
gpl
#http://www.bio-info-trainee.com/1399.html
if(!require(hgu133plus2.db))BiocManager::install("hgu133plus2.db")
library(hgu133plus2.db)
ls("package:hgu133plus2.db")
ids <- toTable(hgu133plus2SYMBOL)
head(ids)
# 方法2 读取gpl页面的soft文件,按列取子集
# 方法3 官网下载
# 方法4 自主注释
save(exp,group_list,ids,file = "step2output.Rdata")
制作PCA图和热图
这一步只要提供exp
和grou_list
其他不用变
rm(list = ls())
load(file = "step1output.Rdata")
load(file = "step2output.Rdata")
boxplot(exp,outline=FALSE,notch=T,las=2)
#输入数据:exp和group_list
#Principal Component Analysis
#http://www.sthda.com/english/articles/31-principal-component-methods-in-r-practical-guide/112-pca-principal-component-analysis-essentials
dat=as.data.frame(t(exp))
library(FactoMineR)#画主成分分析图需要加载这两个包
library(factoextra)
# pca的统一操作走起
dat.pca <- PCA(dat, graph = FALSE)
pca_plot <- fviz_pca_ind(dat.pca,
geom.ind = "point", # show points only (nbut not "text")
col.ind = group_list, # color by groups
#palette = c("#00AFBB", "#E7B800"),
addEllipses = TRUE, # Concentration ellipses
legend.title = "Groups"
)
pca_plot
ggsave(plot = pca_plot,filename = paste0(gse,"PCA.png"))
save(pca_plot,file = "pca_plot.Rdata")
#热图
cg=names(tail(sort(apply(exp,1,sd)),1000))
n=exp[cg,]
#绘制热图
annotation_col=data.frame(group=group_list)
rownames(annotation_col)=colnames(n)
library(pheatmap)
pheatmap1 <- pheatmap(n,
show_colnames =F,
show_rownames = F,
annotation_col=annotation_col,
scale = "row")
ggsave(plot = pheatmap1,filename = paste0(gse,"pheatmap1.png"))
做差异分析
利用贝叶斯
获取表达差异矩阵,用inner_join
获得探针信息,获得deg
表达。
rm(list = ls())
load(file = "step2output.Rdata")
#差异分析,用limma包来做
#需要表达矩阵和group_list,不需要改
library(limma)
design=model.matrix(~group_list)
fit=lmFit(exp,design)
fit=eBayes(fit)
deg=topTable(fit,coef=2,number = Inf)
#为deg数据框添加几列
#1.加probe_id列,把行名变成一列
library(dplyr)
deg <- mutate(deg,probe_id=rownames(deg))
head(deg)
#2.加symbol列,火山图要用
deg <- inner_join(deg,ids,by="probe_id")
head(deg)
deg <- deg[!duplicated(deg$symbol),]
#3.加change列,标记上下调基因
logFC_t=1
P.Value_t = 0.05
k1 = (deg$adj.P.Val < P.Value_t)&(deg$logFC < -logFC_t)
k2 = (deg$adj.P.Val < P.Value_t)&(deg$logFC > logFC_t)
table(k1)
table(k2)
change = ifelse(k1,"down",ifelse(k2,"up","stable"))
deg <- mutate(deg,change)
火山图
提供了差异前几基因
或者某一单独基因
的实现方法
load(file = "step1output.Rdata")
load(file = "step4output.Rdata")
#1.火山图----
library(dplyr)
library(ggplot2)
dat = deg
if(T) {
x1 = dat %>%
filter(change == "up") %>%
arrange(desc(logFC))%>%
head(3)
x2 = dat %>%
filter(change == "down") %>%
arrange(logFC)%>%
head(3)
for_label = rbind(x1,x2)
}
#%>%
if(F){
for_label <- dat%>%
filter(symbol %in% c("VIP","SFRP1"))
}
if(F){
for_label <- dat %>% head(10)
}
if(F) {
x1 = dat %>%
filter(change == "up") %>%
head(3)
x2 = dat %>%
filter(change == "down") %>%
head(3)
for_label = rbind(x1,x2)
}
# head(10)
p <- ggplot(data = dat,
aes(x = logFC,
y = -log10(adj.P.Val))) +
geom_point(alpha=0.4, size=3.5,
aes(color=change)) +
ylab("-log10(Pvalue)")+
scale_color_manual(values=c("blue", "grey","red"))+
geom_vline(xintercept=c(-logFC_t,logFC_t),lty=4,col="black",lwd=0.8) +
geom_hline(yintercept = -log10(P.Value_t),lty=4,col="black",lwd=0.8) +
theme_bw()+
theme(panel.border = element_blank(),panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),axis.line = element_line(colour = "black"))
p
volcano_plot <- p +
geom_point(size = 3, shape = 1, data = for_label) +
ggrepel::geom_label_repel(
aes(label = symbol),
data = for_label,
color="black")
ggsave(plot = volcano_plot,filename = paste0(gse,"volcano.png"))
image.png
热图和拼图
热图提供前多少
基因或者所有
差异基因的热图,if函数提供了标记列名
的函数,最后用patchwork
进行拼图
load(file = 'step2output.Rdata')
dd1=deg
x=dd1$logFC
names(x)=dd1$probe_id
cg=c(names(head(sort(x),30)),names(tail(sort(x),30)))
cg
#cg = deg$probe_id[deg$change !="stable"]
rownames(dd1) <- dd1$probe_id
dd2 <- dd1[c(names(head(sort(x),30)),names(tail(sort(x),30))),"symbol"]
dd2
if(F) {
rownames(dd1) <- dd1$probe_id
dd2 <- dd1[c(names(head(sort(x),30)),names(tail(sort(x),30))),"symbol"]
dd2
}
n=exp[cg,]
#rownames(n) <- dd2
dim(n)
#作热图
library(pheatmap)
annotation_col=data.frame(group=group_list)
rownames(annotation_col)=colnames(n)
library(ggplotify)
heatmap_plot <- as.ggplot(pheatmap(n,show_colnames =F,
show_rownames = T,
scale = "row",
#cluster_cols = F,
annotation_col=annotation_col
))
heatmap_plot
ggsave(heatmap_plot,filename = paste0(gse,"heatmap.png"))
load("pca_plot.Rdata")
library(patchwork)
(pca_plot | volcano_plot )/heatmap_plot+ plot_annotation(tag_levels = "A")
Cggsave(filename = "test.png",width = 15,height = 5)
image.png
网友评论