rm(list = ls())
load(file = "step2output.Rdata")
差异分析,用limma包来做
需要表达矩阵和Group,不需要改(固定四步操作)
library(limma)
design=model.matrix(~Group) #group生成分组向量,对照组在前
fit=lmFit(exp,design) #表达矩阵,group生成矩阵
fit=eBayes(fit) #线性离合
deg=topTable(fit,coef=2,number = Inf) #生成deg的数据框
deg
> dim(deg) #虽然行名一致,但是顺序不一致,原因是deg在读取的时候按p值大小已经重新排列了顺序
[1] 54675 6
> dim(exp)
[1] 54675 22
> identical(rownames(deg),rownames(exp)) #若完全一致返回true
[1] FALSE
image.png
为deg数据框添加几列
#1.加probe_id列(探针id),本来是deg数据框的行名,为了好操作变成数据框新的一列
library(dplyr)
deg <- mutate(deg,probe_id=rownames(deg)) #或者deg$probe_id=rownames(deg)
head(deg)
#2.加上探针注释 (ids是我们前面从r包或者官网得到的关于探针的注释信息)
table(!duplicated(ids$probe_id))
table(!duplicated(ids$symbol))
> table(!duplicated(ids$probe_id))
TRUE #没有重复,41905个取值
41905
> table(!duplicated(ids$symbol))
FALSE TRUE #有重复其中21744个symble(基因)重复
21744 20161
#按symbol列去重(不同探针对应相同基因,所以基因重复多次而探针不重复)常见标准有3个:最大值/平均值/随机去重
#随机去重,另两个见zz.去重方式.R
ids = ids[!duplicated(ids$symbol),] #只保留第一次出现的值
deg <- inner_join(deg,ids,by="probe_id") #取交集,不存在的就被去除
head(deg)
nrow(deg)
#3.加change列,标记上下调基因
logFC_t=1 #可更改
P.Value_t = 0.01 #可更改/0.05
k1 = (deg$P.Value < P.Value_t)&(deg$logFC < -logFC_t)
k2 = (deg$P.Value < P.Value_t)&(deg$logFC > logFC_t)
change = ifelse(k1,
"down",
ifelse(k2,"up","stable"))
deg <- mutate(deg,change)
#4.加ENTREZID列,用于富集分析(symbol转entrezid,然后inner_join)
library(clusterProfiler)
library(org.Hs.eg.db)
s2e <- bitr(deg$symbol,
fromType = "SYMBOL",
toType = "ENTREZID",
OrgDb = org.Hs.eg.db)#人类
#其他物种http://bioconductor.org/packages/release/BiocViews.html#___OrgDb
dim(deg)
deg <- inner_join(deg,s2e,by=c("symbol"="SYMBOL")) #将两个表格通过相同点symbol连接起来
dim(deg)
length(unique(deg$symbol)) #合并后的行数增加了是因为symbol与entrezid不是一一对应的关系,属正常
save(Group,deg,logFC_t,P.Value_t,gse_number,file = "step4output.Rdata")
出图
rm(list = ls())
load(file = "step1output.Rdata")
load(file = "step4output.Rdata")
31.火山图----
library(dplyr)
library(ggplot2)
dat = deg
p <- ggplot(data = dat,
aes(x = logFC,
y = -log10(P.Value))) +
geom_point(alpha=0.4, size=3.5,
aes(color=change)) +
ylab("-log10(Pvalue)")+
scale_color_manual(values=c("blue", "grey","red"))+ #手动设置颜色(up、down、stable)
geom_vline(xintercept=c(-logFC_t,logFC_t),lty=4,col="black",lwd=0.8) + #横线
geom_hline(yintercept = -log10(P.Value_t),lty=4,col="black",lwd=0.8) + #竖线
theme_bw() #挑颜色
p
标记我想要标记的基因名
if(T){
#自选基因
for_label <- dat%>%
filter(symbol %in% c("HADHA","LRRFIP1"))
}
if(F){
#p值最小的10个
for_label <- dat %>% head(10)
}
if(F) {
#p值最小的前3下调和前3上调
x1 = dat %>%
filter(change == "up") %>%
head(3)
x2 = dat %>%
filter(change == "down") %>%
head(3)
for_label = rbind(x1,x2)
}
#这三个方法都是为了生成我作图需要的数据 for_label,也就是我想标记基因的相关数据
volcano_plot <- p +
geom_point(size = 3, shape = 1, data = for_label) +
ggrepel::geom_label_repel(
aes(label = symbol), #标记我所需要的symble
data = for_label,
color="black"
)
volcano_plot
ggsave(plot = volcano_plot,filename = paste0(gse_number,"_volcano.png"))
2.差异基因热图----
load(file = 'step2output.Rdata')
if(T){
#全部差异基因
cg = deg$probe_id[deg$change !="stable"]
length(cg)
}else{
#取前30上调和前30下调
x=deg$logFC[deg$change !="stable"]
names(x)=deg$probe_id[deg$change !="stable"]
cg=names(c(head(sort(x),30),tail(sort(x),30)))
length(cg)
}
n=exp[cg,]
dim(n)
#差异基因热图
library(pheatmap)
annotation_col=data.frame(group=Group)
rownames(annotation_col)=colnames(n)
heatmap_plot <- pheatmap(n,show_colnames =F,
show_rownames = F,
scale = "row",
#cluster_cols = F,
annotation_col=annotation_col,
breaks = seq(-3,3,length.out = 100)
)
heatmap_plot
#存图拼图
ggsave(heatmap_plot,filename = paste0(gse_number,"_heatmap.png"))
load("pca_plot.Rdata")
library(patchwork)
library(ggplotify)
(pca_plot + volcano_plot +as.ggplot(heatmap_plot)) #做拼图
image.png
1.GO 富集分析----
rm(list = ls())
load(file = 'step4output.Rdata')
library(clusterProfiler)
library(dplyr)
library(ggplot2)
library(stringr)
library(enrichplot)
(1)输入数据
gene_up = deg[deg$change == 'up','ENTREZID']
gene_down = deg[deg$change == 'down','ENTREZID']
gene_diff = c(gene_up,gene_down)
gene_all = deg[,'ENTREZID']
(2)富集
#以下步骤耗时很长,设置了存在即跳过
if(!file.exists(paste0(gse_number,"_GO.Rdata"))){
ego <- enrichGO(gene = gene_diff,
OrgDb= org.Hs.eg.db,
ont = "ALL", #参数为加上下满三个共四个
readable = TRUE)
#ont参数:One of "BP", "MF", and "CC" subontologies, or "ALL" for all three.
save(ego,file = paste0(gse_number,"_GO.Rdata"))
}
load(paste0(gse_number,"_GO.Rdata"))
dev.off()
(3)可视化
#条带图
barplot(ego)
#气泡图
dotplot(ego)
dotplot(ego, split = "ONTOLOGY", font.size = 10,
showCategory = 5) + facet_grid(ONTOLOGY ~ ., scale = "free") +
scale_y_discrete(labels = function(x) str_wrap(x, width = 45))
#geneList 用于设置下面图的颜色
geneList = deg$logFC
names(geneList)=deg$ENTREZID
geneList = sort(geneList,decreasing = T)
#(3)展示top通路的共同基因,要放大看。
#Gene-Concept Network
cnetplot(ego,categorySize="pvalue", foldChange=geneList,colorEdge = TRUE)
cnetplot(ego, showCategory = 3,foldChange=geneList, circular = TRUE, colorEdge = TRUE)
#Enrichment Map,这个函数最近更新过,版本不同代码会不同
Biobase::package.version("enrichplot")
if(F){
emapplot(pairwise_termsim(ego)) #新版本
}else{
#emapplot(ego)#老版本
}
#(4)展示通路关系 https://zhuanlan.zhihu.com/p/99789859
#goplot(ego)
#(5)Heatmap-like functional classification
heatplot(ego,foldChange = geneList,showCategory = 8)
2.KEGG pathway analysis----
#上调、下调、差异、所有基因
#(1)输入数据
gene_up = deg[deg$change == 'up','ENTREZID']
gene_down = deg[deg$change == 'down','ENTREZID']
gene_diff = c(gene_up,gene_down)
gene_all = deg[,'ENTREZID']
#(2)对上调/下调/所有差异基因进行富集分析
if(!file.exists(paste0(gse_number,"_KEGG.Rdata"))){
kk.up <- enrichKEGG(gene = gene_up,
organism = 'hsa')
kk.down <- enrichKEGG(gene = gene_down,
organism = 'hsa')
kk.diff <- enrichKEGG(gene = gene_diff,
organism = 'hsa')
save(kk.diff,kk.down,kk.up,file = paste0(gse_number,"_KEGG.Rdata"))
}
load(paste0(gse_number,"_KEGG.Rdata"))
#(3)看看富集到了吗?https://mp.weixin.qq.com/s/NglawJgVgrMJ0QfD-YRBQg
table(kk.diff@result$p.adjust<0.05)
table(kk.up@result$p.adjust<0.05)
table(kk.down@result$p.adjust<0.05)
#(4)按照pvalue筛选通路
down_kegg <- kk.down@result %>%
filter(pvalue<0.05) %>% #筛选行
mutate(group=-1) #新增列
up_kegg <- kk.up@result %>%
filter(pvalue<0.05) %>%
mutate(group=1)
#(5)可视化
source("kegg_plot_function.R")
g_kegg <- kegg_plot(up_kegg,down_kegg)
g_kegg
#g_kegg +scale_y_continuous(labels = c(4,2,0,2,4)) #涂上横坐标取log后是没有负数的,需要自己设置一下坐标,根据前面出的数据将负的改为正数
ggsave(g_kegg,filename = 'kegg_up_down.png')
3.gsea作kegg和GO富集分析----
https://www.jianshu.com/p/c5b7b7dbf29b
#(1)查看示例数据
data(geneList, package="DOSE")
#(2)将我们的数据转换成示例数据的格式
geneList=deg$logFC
names(geneList)=deg$ENTREZID
geneList=sort(geneList,decreasing = T)
#(3)富集分析
kk_gse <- gseKEGG(geneList = geneList,
organism = 'hsa',
verbose = FALSE)
down_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore < 0,];down_kegg$group=-1
up_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore > 0,];up_kegg$group=1
#(4)可视化
g2 = kegg_plot(up_kegg,down_kegg)
g2
4.能看懂的资料越来越多----
GSEA学习更多:https://www.jianshu.com/p/baf85b51752e
富集分析学习更多:http://yulab-smu.top/clusterProfiler-book/index.html
弦图:https://www.jianshu.com/p/e4bb41865b7f
GOplot:https://mp.weixin.qq.com/s/LonwdDhDn8iFUfxqSJ2Wew
网上的资料和宝藏无穷无尽,学好R语言慢慢发掘~
AnnoProbe包拓展运用:三种函数
library(AnnoProbe)
#获取数据
?geoChina
geoChina("GSE1009")
load('GSE1009_eSet.Rdata')
class(gset)
length(gset)
class(gset[[1]])
geoChina("GSE1009")
load('GSE1009_eSet.Rdata')
eSet=gset[[1]]
#查看探针与基因名的关系
ids=idmap("GPL570")
head(ids)
> head(ids)
probe_id symbol
193731 1053_at RFC2
193732 117_at HSPA6
193733 121_at PAX8
193734 1255_g_at GUCA1A
193735 1316_at THRA
193736 1320_at PTPN21
#告诉我们是什么样的基因,存在染色体位置等
?annoGene()
IDs <- c("DDX11L1", "MIR6859-1", "OR4G4P", "OR4F5")
ID_type = "SYMBOL"
annoGene(IDs, ID_type)
annoGene(IDs, ID_type,out_file ='tmp.html')
annoGene(IDs, ID_type,out_file ='tmp.csv')
> annoGene(IDs, ID_type,out_file ='tmp.csv')
SYMBOL biotypes ENSEMBL chr start end
1 DDX11L1 transcribed_unprocessed_pseudogene ENSG00000223972 chr1 11869 14409
5 OR4G4P unprocessed_pseudogene ENSG00000268020 chr1 52473 53312
7 OR4F5 protein_coding ENSG00000186092 chr1 65419 71585
小洁老师自制r包,简化全过程16-3 12:00
devtools::install_github("xjsun1221/tinyarray")
library(tinyarray)
geo = geo_download("GSE56649") #下载数据
View(geo$pd)
pd = geo$pd
library(stringr)
Group=ifelse(str_detect(pd$source_name_ch1,"control"),
"control",
"RA") #规定分组
#设置参考水平,指定levels,对照组在前,处理组在后
Group = factor(Group,
levels = c("control","RA"))
Group
find_anno(geo$gpl,install = T)
ids <- toTable(hgu133plus2SYMBOL)
geo$exp = log2(geo$exp+1)
deg = get_deg_all(geo$exp,Group,ids)
deg$plots #出图
image.png
网友评论