美文网首页
差异分析16-1

差异分析16-1

作者: 小胡同学ime | 来源:发表于2021-12-20 18:43 被阅读0次
image.png

rm(list = ls())
load(file = "step2output.Rdata")

差异分析,用limma包来做

需要表达矩阵和Group,不需要改(固定四步操作)

library(limma)
design=model.matrix(~Group)  #group生成分组向量,对照组在前
fit=lmFit(exp,design)  #表达矩阵,group生成矩阵
fit=eBayes(fit)    #线性离合
deg=topTable(fit,coef=2,number = Inf)   #生成deg的数据框
deg
> dim(deg)  #虽然行名一致,但是顺序不一致,原因是deg在读取的时候按p值大小已经重新排列了顺序
[1] 54675     6
> dim(exp)
[1] 54675    22
> identical(rownames(deg),rownames(exp))  #若完全一致返回true
[1] FALSE
image.png

为deg数据框添加几列

#1.加probe_id列(探针id),本来是deg数据框的行名,为了好操作变成数据框新的一列
library(dplyr)
deg <- mutate(deg,probe_id=rownames(deg))  #或者deg$probe_id=rownames(deg)
head(deg)

#2.加上探针注释 (ids是我们前面从r包或者官网得到的关于探针的注释信息)
table(!duplicated(ids$probe_id))
table(!duplicated(ids$symbol))

> table(!duplicated(ids$probe_id))
TRUE    #没有重复,41905个取值
41905 
> table(!duplicated(ids$symbol)) 
FALSE  TRUE  #有重复其中21744个symble(基因)重复
21744 20161 
#按symbol列去重(不同探针对应相同基因,所以基因重复多次而探针不重复)常见标准有3个:最大值/平均值/随机去重
#随机去重,另两个见zz.去重方式.R
ids = ids[!duplicated(ids$symbol),]  #只保留第一次出现的值
deg <- inner_join(deg,ids,by="probe_id")  #取交集,不存在的就被去除
head(deg)
nrow(deg)

#3.加change列,标记上下调基因
logFC_t=1  #可更改
P.Value_t = 0.01   #可更改/0.05
k1 = (deg$P.Value < P.Value_t)&(deg$logFC < -logFC_t)
k2 = (deg$P.Value < P.Value_t)&(deg$logFC > logFC_t)
change = ifelse(k1,
                "down",
                ifelse(k2,"up","stable"))
deg <- mutate(deg,change)

#4.加ENTREZID列,用于富集分析(symbol转entrezid,然后inner_join)
library(clusterProfiler)
library(org.Hs.eg.db)
s2e <- bitr(deg$symbol, 
            fromType = "SYMBOL",
            toType = "ENTREZID",
            OrgDb = org.Hs.eg.db)#人类
#其他物种http://bioconductor.org/packages/release/BiocViews.html#___OrgDb
dim(deg)
deg <- inner_join(deg,s2e,by=c("symbol"="SYMBOL")) #将两个表格通过相同点symbol连接起来
dim(deg)
length(unique(deg$symbol))  #合并后的行数增加了是因为symbol与entrezid不是一一对应的关系,属正常
save(Group,deg,logFC_t,P.Value_t,gse_number,file = "step4output.Rdata")

出图

rm(list = ls()) 
load(file = "step1output.Rdata")
load(file = "step4output.Rdata")

31.火山图----

library(dplyr)
library(ggplot2)
dat  = deg

p <- ggplot(data = dat, 
            aes(x = logFC, 
                y = -log10(P.Value))) +
  geom_point(alpha=0.4, size=3.5, 
             aes(color=change)) +
  ylab("-log10(Pvalue)")+
  scale_color_manual(values=c("blue", "grey","red"))+  #手动设置颜色(up、down、stable)
  geom_vline(xintercept=c(-logFC_t,logFC_t),lty=4,col="black",lwd=0.8) +  #横线
  geom_hline(yintercept = -log10(P.Value_t),lty=4,col="black",lwd=0.8) +  #竖线
  theme_bw()   #挑颜色
p

标记我想要标记的基因名

if(T){
  #自选基因
  for_label <- dat%>% 
    filter(symbol %in% c("HADHA","LRRFIP1")) 
}
if(F){
  #p值最小的10个
  for_label <- dat %>% head(10)
}
if(F) {
  #p值最小的前3下调和前3上调
  x1 = dat %>% 
    filter(change == "up") %>% 
    head(3)
  x2 = dat %>% 
    filter(change == "down") %>% 
    head(3)
  for_label = rbind(x1,x2)
}
#这三个方法都是为了生成我作图需要的数据 for_label,也就是我想标记基因的相关数据

volcano_plot <- p +
  geom_point(size = 3, shape = 1, data = for_label) +
  ggrepel::geom_label_repel(
    aes(label = symbol),   #标记我所需要的symble
    data = for_label,
    color="black"
  )
volcano_plot
ggsave(plot = volcano_plot,filename = paste0(gse_number,"_volcano.png"))

2.差异基因热图----

load(file = 'step2output.Rdata')
if(T){
  #全部差异基因
  cg = deg$probe_id[deg$change !="stable"]
  length(cg)
}else{

  #取前30上调和前30下调
  x=deg$logFC[deg$change !="stable"] 
 names(x)=deg$probe_id[deg$change !="stable"] 
  cg=names(c(head(sort(x),30),tail(sort(x),30)))
  length(cg)
}
n=exp[cg,]
dim(n)

#差异基因热图
library(pheatmap)
annotation_col=data.frame(group=Group)
rownames(annotation_col)=colnames(n) 
heatmap_plot <- pheatmap(n,show_colnames =F,
                         show_rownames = F,
                         scale = "row",
                         #cluster_cols = F, 
                         annotation_col=annotation_col,
                         breaks = seq(-3,3,length.out = 100)
) 
heatmap_plot

#存图拼图
ggsave(heatmap_plot,filename = paste0(gse_number,"_heatmap.png"))
load("pca_plot.Rdata")
library(patchwork)
library(ggplotify)
(pca_plot + volcano_plot +as.ggplot(heatmap_plot))  #做拼图
image.png

1.GO 富集分析----

rm(list = ls())  
load(file = 'step4output.Rdata')
library(clusterProfiler)
library(dplyr)
library(ggplot2)
library(stringr)
library(enrichplot)

(1)输入数据

gene_up = deg[deg$change == 'up','ENTREZID'] 
gene_down = deg[deg$change == 'down','ENTREZID'] 
gene_diff = c(gene_up,gene_down)
gene_all = deg[,'ENTREZID']

(2)富集

#以下步骤耗时很长,设置了存在即跳过
if(!file.exists(paste0(gse_number,"_GO.Rdata"))){
  ego <- enrichGO(gene = gene_diff,
                  OrgDb= org.Hs.eg.db,
                  ont = "ALL",  #参数为加上下满三个共四个
                  readable = TRUE)
  #ont参数:One of "BP", "MF", and "CC" subontologies, or "ALL" for all three.
  save(ego,file = paste0(gse_number,"_GO.Rdata"))
}
load(paste0(gse_number,"_GO.Rdata"))
dev.off()

(3)可视化

#条带图
barplot(ego)
#气泡图
dotplot(ego)

dotplot(ego, split = "ONTOLOGY", font.size = 10, 
        showCategory = 5) + facet_grid(ONTOLOGY ~ ., scale = "free") + 
  scale_y_discrete(labels = function(x) str_wrap(x, width = 45))

#geneList 用于设置下面图的颜色
geneList = deg$logFC
names(geneList)=deg$ENTREZID
geneList = sort(geneList,decreasing = T)

#(3)展示top通路的共同基因,要放大看。
#Gene-Concept Network
cnetplot(ego,categorySize="pvalue", foldChange=geneList,colorEdge = TRUE)
cnetplot(ego, showCategory = 3,foldChange=geneList, circular = TRUE, colorEdge = TRUE)
#Enrichment Map,这个函数最近更新过,版本不同代码会不同

Biobase::package.version("enrichplot")

if(F){
  emapplot(pairwise_termsim(ego)) #新版本
}else{
  #emapplot(ego)#老版本
}

#(4)展示通路关系 https://zhuanlan.zhihu.com/p/99789859
#goplot(ego)

#(5)Heatmap-like functional classification
heatplot(ego,foldChange = geneList,showCategory = 8)

2.KEGG pathway analysis----

#上调、下调、差异、所有基因
#(1)输入数据
gene_up = deg[deg$change == 'up','ENTREZID'] 
gene_down = deg[deg$change == 'down','ENTREZID'] 
gene_diff = c(gene_up,gene_down)
gene_all = deg[,'ENTREZID']

#(2)对上调/下调/所有差异基因进行富集分析
if(!file.exists(paste0(gse_number,"_KEGG.Rdata"))){
  kk.up <- enrichKEGG(gene         = gene_up,
                      organism     = 'hsa')
  kk.down <- enrichKEGG(gene         =  gene_down,
                        organism     = 'hsa')
  kk.diff <- enrichKEGG(gene         = gene_diff,
                        organism     = 'hsa')
  save(kk.diff,kk.down,kk.up,file = paste0(gse_number,"_KEGG.Rdata"))
}
load(paste0(gse_number,"_KEGG.Rdata"))

#(3)看看富集到了吗?https://mp.weixin.qq.com/s/NglawJgVgrMJ0QfD-YRBQg
table(kk.diff@result$p.adjust<0.05)
table(kk.up@result$p.adjust<0.05)
table(kk.down@result$p.adjust<0.05)

#(4)按照pvalue筛选通路
down_kegg <- kk.down@result %>%
  filter(pvalue<0.05) %>% #筛选行
  mutate(group=-1) #新增列

up_kegg <- kk.up@result %>%
  filter(pvalue<0.05) %>%
  mutate(group=1)

#(5)可视化
source("kegg_plot_function.R")
g_kegg <- kegg_plot(up_kegg,down_kegg)
g_kegg
#g_kegg +scale_y_continuous(labels = c(4,2,0,2,4)) #涂上横坐标取log后是没有负数的,需要自己设置一下坐标,根据前面出的数据将负的改为正数
ggsave(g_kegg,filename = 'kegg_up_down.png')

3.gsea作kegg和GO富集分析----
https://www.jianshu.com/p/c5b7b7dbf29b

#(1)查看示例数据
data(geneList, package="DOSE")
#(2)将我们的数据转换成示例数据的格式
geneList=deg$logFC
names(geneList)=deg$ENTREZID
geneList=sort(geneList,decreasing = T)
#(3)富集分析
kk_gse <- gseKEGG(geneList     = geneList,
                  organism     = 'hsa',
                  verbose      = FALSE)
down_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore < 0,];down_kegg$group=-1
up_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore > 0,];up_kegg$group=1
#(4)可视化
g2 = kegg_plot(up_kegg,down_kegg)
g2

4.能看懂的资料越来越多----
GSEA学习更多:https://www.jianshu.com/p/baf85b51752e
富集分析学习更多:http://yulab-smu.top/clusterProfiler-book/index.html
弦图:https://www.jianshu.com/p/e4bb41865b7f
GOplot:https://mp.weixin.qq.com/s/LonwdDhDn8iFUfxqSJ2Wew
网上的资料和宝藏无穷无尽,学好R语言慢慢发掘~

AnnoProbe包拓展运用:三种函数

library(AnnoProbe)
#获取数据
?geoChina
geoChina("GSE1009")
load('GSE1009_eSet.Rdata')
class(gset)
length(gset)
class(gset[[1]])

geoChina("GSE1009")
load('GSE1009_eSet.Rdata')
eSet=gset[[1]]

#查看探针与基因名的关系
ids=idmap("GPL570")
head(ids)
> head(ids)
        probe_id symbol
193731   1053_at   RFC2
193732    117_at  HSPA6
193733    121_at   PAX8
193734 1255_g_at GUCA1A
193735   1316_at   THRA
193736   1320_at PTPN21

#告诉我们是什么样的基因,存在染色体位置等
?annoGene()
IDs <- c("DDX11L1", "MIR6859-1", "OR4G4P", "OR4F5")
ID_type = "SYMBOL"
annoGene(IDs, ID_type)
annoGene(IDs, ID_type,out_file ='tmp.html')
annoGene(IDs, ID_type,out_file ='tmp.csv')

> annoGene(IDs, ID_type,out_file ='tmp.csv')
SYMBOL                           biotypes         ENSEMBL  chr start   end
1 DDX11L1 transcribed_unprocessed_pseudogene ENSG00000223972 chr1 11869 14409
5  OR4G4P             unprocessed_pseudogene ENSG00000268020 chr1 52473 53312
7   OR4F5                     protein_coding ENSG00000186092 chr1 65419 71585

小洁老师自制r包,简化全过程16-3 12:00

devtools::install_github("xjsun1221/tinyarray")

library(tinyarray)
geo = geo_download("GSE56649")  #下载数据

View(geo$pd)
pd = geo$pd
library(stringr)
Group=ifelse(str_detect(pd$source_name_ch1,"control"),
             "control",
             "RA")  #规定分组
#设置参考水平,指定levels,对照组在前,处理组在后
Group = factor(Group,
               levels = c("control","RA"))
Group

find_anno(geo$gpl,install = T)

ids <- toTable(hgu133plus2SYMBOL)

geo$exp = log2(geo$exp+1)
deg = get_deg_all(geo$exp,Group,ids)
deg$plots #出图
image.png

相关文章

网友评论

      本文标题:差异分析16-1

      本文链接:https://www.haomeiwen.com/subject/rdzvpltx.html