美文网首页
差异分析16-1

差异分析16-1

作者: 小胡同学ime | 来源:发表于2021-12-20 18:43 被阅读0次
    image.png

    rm(list = ls())
    load(file = "step2output.Rdata")

    差异分析,用limma包来做

    需要表达矩阵和Group,不需要改(固定四步操作)

    library(limma)
    design=model.matrix(~Group)  #group生成分组向量,对照组在前
    fit=lmFit(exp,design)  #表达矩阵,group生成矩阵
    fit=eBayes(fit)    #线性离合
    deg=topTable(fit,coef=2,number = Inf)   #生成deg的数据框
    deg
    
    > dim(deg)  #虽然行名一致,但是顺序不一致,原因是deg在读取的时候按p值大小已经重新排列了顺序
    [1] 54675     6
    > dim(exp)
    [1] 54675    22
    > identical(rownames(deg),rownames(exp))  #若完全一致返回true
    [1] FALSE
    
    image.png

    为deg数据框添加几列

    #1.加probe_id列(探针id),本来是deg数据框的行名,为了好操作变成数据框新的一列
    library(dplyr)
    deg <- mutate(deg,probe_id=rownames(deg))  #或者deg$probe_id=rownames(deg)
    head(deg)
    
    #2.加上探针注释 (ids是我们前面从r包或者官网得到的关于探针的注释信息)
    table(!duplicated(ids$probe_id))
    table(!duplicated(ids$symbol))
    
    > table(!duplicated(ids$probe_id))
    TRUE    #没有重复,41905个取值
    41905 
    > table(!duplicated(ids$symbol)) 
    FALSE  TRUE  #有重复其中21744个symble(基因)重复
    21744 20161 
    #按symbol列去重(不同探针对应相同基因,所以基因重复多次而探针不重复)常见标准有3个:最大值/平均值/随机去重
    #随机去重,另两个见zz.去重方式.R
    ids = ids[!duplicated(ids$symbol),]  #只保留第一次出现的值
    deg <- inner_join(deg,ids,by="probe_id")  #取交集,不存在的就被去除
    head(deg)
    nrow(deg)
    
    #3.加change列,标记上下调基因
    logFC_t=1  #可更改
    P.Value_t = 0.01   #可更改/0.05
    k1 = (deg$P.Value < P.Value_t)&(deg$logFC < -logFC_t)
    k2 = (deg$P.Value < P.Value_t)&(deg$logFC > logFC_t)
    change = ifelse(k1,
                    "down",
                    ifelse(k2,"up","stable"))
    deg <- mutate(deg,change)
    
    #4.加ENTREZID列,用于富集分析(symbol转entrezid,然后inner_join)
    library(clusterProfiler)
    library(org.Hs.eg.db)
    s2e <- bitr(deg$symbol, 
                fromType = "SYMBOL",
                toType = "ENTREZID",
                OrgDb = org.Hs.eg.db)#人类
    #其他物种http://bioconductor.org/packages/release/BiocViews.html#___OrgDb
    dim(deg)
    deg <- inner_join(deg,s2e,by=c("symbol"="SYMBOL")) #将两个表格通过相同点symbol连接起来
    dim(deg)
    length(unique(deg$symbol))  #合并后的行数增加了是因为symbol与entrezid不是一一对应的关系,属正常
    save(Group,deg,logFC_t,P.Value_t,gse_number,file = "step4output.Rdata")
    

    出图

    rm(list = ls()) 
    load(file = "step1output.Rdata")
    load(file = "step4output.Rdata")
    

    31.火山图----

    library(dplyr)
    library(ggplot2)
    dat  = deg
    
    p <- ggplot(data = dat, 
                aes(x = logFC, 
                    y = -log10(P.Value))) +
      geom_point(alpha=0.4, size=3.5, 
                 aes(color=change)) +
      ylab("-log10(Pvalue)")+
      scale_color_manual(values=c("blue", "grey","red"))+  #手动设置颜色(up、down、stable)
      geom_vline(xintercept=c(-logFC_t,logFC_t),lty=4,col="black",lwd=0.8) +  #横线
      geom_hline(yintercept = -log10(P.Value_t),lty=4,col="black",lwd=0.8) +  #竖线
      theme_bw()   #挑颜色
    p
    

    标记我想要标记的基因名

    if(T){
      #自选基因
      for_label <- dat%>% 
        filter(symbol %in% c("HADHA","LRRFIP1")) 
    }
    if(F){
      #p值最小的10个
      for_label <- dat %>% head(10)
    }
    if(F) {
      #p值最小的前3下调和前3上调
      x1 = dat %>% 
        filter(change == "up") %>% 
        head(3)
      x2 = dat %>% 
        filter(change == "down") %>% 
        head(3)
      for_label = rbind(x1,x2)
    }
    #这三个方法都是为了生成我作图需要的数据 for_label,也就是我想标记基因的相关数据
    
    volcano_plot <- p +
      geom_point(size = 3, shape = 1, data = for_label) +
      ggrepel::geom_label_repel(
        aes(label = symbol),   #标记我所需要的symble
        data = for_label,
        color="black"
      )
    volcano_plot
    ggsave(plot = volcano_plot,filename = paste0(gse_number,"_volcano.png"))
    

    2.差异基因热图----

    load(file = 'step2output.Rdata')
    if(T){
      #全部差异基因
      cg = deg$probe_id[deg$change !="stable"]
      length(cg)
    }else{
    
      #取前30上调和前30下调
      x=deg$logFC[deg$change !="stable"] 
     names(x)=deg$probe_id[deg$change !="stable"] 
      cg=names(c(head(sort(x),30),tail(sort(x),30)))
      length(cg)
    }
    n=exp[cg,]
    dim(n)
    
    #差异基因热图
    library(pheatmap)
    annotation_col=data.frame(group=Group)
    rownames(annotation_col)=colnames(n) 
    heatmap_plot <- pheatmap(n,show_colnames =F,
                             show_rownames = F,
                             scale = "row",
                             #cluster_cols = F, 
                             annotation_col=annotation_col,
                             breaks = seq(-3,3,length.out = 100)
    ) 
    heatmap_plot
    
    #存图拼图
    ggsave(heatmap_plot,filename = paste0(gse_number,"_heatmap.png"))
    load("pca_plot.Rdata")
    library(patchwork)
    library(ggplotify)
    (pca_plot + volcano_plot +as.ggplot(heatmap_plot))  #做拼图
    
    image.png

    1.GO 富集分析----

    rm(list = ls())  
    load(file = 'step4output.Rdata')
    library(clusterProfiler)
    library(dplyr)
    library(ggplot2)
    library(stringr)
    library(enrichplot)
    

    (1)输入数据

    gene_up = deg[deg$change == 'up','ENTREZID'] 
    gene_down = deg[deg$change == 'down','ENTREZID'] 
    gene_diff = c(gene_up,gene_down)
    gene_all = deg[,'ENTREZID']
    

    (2)富集

    #以下步骤耗时很长,设置了存在即跳过
    if(!file.exists(paste0(gse_number,"_GO.Rdata"))){
      ego <- enrichGO(gene = gene_diff,
                      OrgDb= org.Hs.eg.db,
                      ont = "ALL",  #参数为加上下满三个共四个
                      readable = TRUE)
      #ont参数:One of "BP", "MF", and "CC" subontologies, or "ALL" for all three.
      save(ego,file = paste0(gse_number,"_GO.Rdata"))
    }
    load(paste0(gse_number,"_GO.Rdata"))
    dev.off()
    

    (3)可视化

    #条带图
    barplot(ego)
    #气泡图
    dotplot(ego)
    
    dotplot(ego, split = "ONTOLOGY", font.size = 10, 
            showCategory = 5) + facet_grid(ONTOLOGY ~ ., scale = "free") + 
      scale_y_discrete(labels = function(x) str_wrap(x, width = 45))
    
    #geneList 用于设置下面图的颜色
    geneList = deg$logFC
    names(geneList)=deg$ENTREZID
    geneList = sort(geneList,decreasing = T)
    
    #(3)展示top通路的共同基因,要放大看。
    #Gene-Concept Network
    cnetplot(ego,categorySize="pvalue", foldChange=geneList,colorEdge = TRUE)
    cnetplot(ego, showCategory = 3,foldChange=geneList, circular = TRUE, colorEdge = TRUE)
    #Enrichment Map,这个函数最近更新过,版本不同代码会不同
    
    Biobase::package.version("enrichplot")
    
    if(F){
      emapplot(pairwise_termsim(ego)) #新版本
    }else{
      #emapplot(ego)#老版本
    }
    
    #(4)展示通路关系 https://zhuanlan.zhihu.com/p/99789859
    #goplot(ego)
    
    #(5)Heatmap-like functional classification
    heatplot(ego,foldChange = geneList,showCategory = 8)
    

    2.KEGG pathway analysis----

    #上调、下调、差异、所有基因
    #(1)输入数据
    gene_up = deg[deg$change == 'up','ENTREZID'] 
    gene_down = deg[deg$change == 'down','ENTREZID'] 
    gene_diff = c(gene_up,gene_down)
    gene_all = deg[,'ENTREZID']
    
    #(2)对上调/下调/所有差异基因进行富集分析
    if(!file.exists(paste0(gse_number,"_KEGG.Rdata"))){
      kk.up <- enrichKEGG(gene         = gene_up,
                          organism     = 'hsa')
      kk.down <- enrichKEGG(gene         =  gene_down,
                            organism     = 'hsa')
      kk.diff <- enrichKEGG(gene         = gene_diff,
                            organism     = 'hsa')
      save(kk.diff,kk.down,kk.up,file = paste0(gse_number,"_KEGG.Rdata"))
    }
    load(paste0(gse_number,"_KEGG.Rdata"))
    
    #(3)看看富集到了吗?https://mp.weixin.qq.com/s/NglawJgVgrMJ0QfD-YRBQg
    table(kk.diff@result$p.adjust<0.05)
    table(kk.up@result$p.adjust<0.05)
    table(kk.down@result$p.adjust<0.05)
    
    #(4)按照pvalue筛选通路
    down_kegg <- kk.down@result %>%
      filter(pvalue<0.05) %>% #筛选行
      mutate(group=-1) #新增列
    
    up_kegg <- kk.up@result %>%
      filter(pvalue<0.05) %>%
      mutate(group=1)
    
    #(5)可视化
    source("kegg_plot_function.R")
    g_kegg <- kegg_plot(up_kegg,down_kegg)
    g_kegg
    #g_kegg +scale_y_continuous(labels = c(4,2,0,2,4)) #涂上横坐标取log后是没有负数的,需要自己设置一下坐标,根据前面出的数据将负的改为正数
    ggsave(g_kegg,filename = 'kegg_up_down.png')
    

    3.gsea作kegg和GO富集分析----
    https://www.jianshu.com/p/c5b7b7dbf29b

    #(1)查看示例数据
    data(geneList, package="DOSE")
    #(2)将我们的数据转换成示例数据的格式
    geneList=deg$logFC
    names(geneList)=deg$ENTREZID
    geneList=sort(geneList,decreasing = T)
    #(3)富集分析
    kk_gse <- gseKEGG(geneList     = geneList,
                      organism     = 'hsa',
                      verbose      = FALSE)
    down_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore < 0,];down_kegg$group=-1
    up_kegg<-kk_gse[kk_gse$pvalue<0.05 & kk_gse$enrichmentScore > 0,];up_kegg$group=1
    #(4)可视化
    g2 = kegg_plot(up_kegg,down_kegg)
    g2
    

    4.能看懂的资料越来越多----
    GSEA学习更多:https://www.jianshu.com/p/baf85b51752e
    富集分析学习更多:http://yulab-smu.top/clusterProfiler-book/index.html
    弦图:https://www.jianshu.com/p/e4bb41865b7f
    GOplot:https://mp.weixin.qq.com/s/LonwdDhDn8iFUfxqSJ2Wew
    网上的资料和宝藏无穷无尽,学好R语言慢慢发掘~

    AnnoProbe包拓展运用:三种函数

    library(AnnoProbe)
    #获取数据
    ?geoChina
    geoChina("GSE1009")
    load('GSE1009_eSet.Rdata')
    class(gset)
    length(gset)
    class(gset[[1]])
    
    geoChina("GSE1009")
    load('GSE1009_eSet.Rdata')
    eSet=gset[[1]]
    
    #查看探针与基因名的关系
    ids=idmap("GPL570")
    head(ids)
    > head(ids)
            probe_id symbol
    193731   1053_at   RFC2
    193732    117_at  HSPA6
    193733    121_at   PAX8
    193734 1255_g_at GUCA1A
    193735   1316_at   THRA
    193736   1320_at PTPN21
    
    #告诉我们是什么样的基因,存在染色体位置等
    ?annoGene()
    IDs <- c("DDX11L1", "MIR6859-1", "OR4G4P", "OR4F5")
    ID_type = "SYMBOL"
    annoGene(IDs, ID_type)
    annoGene(IDs, ID_type,out_file ='tmp.html')
    annoGene(IDs, ID_type,out_file ='tmp.csv')
    
    > annoGene(IDs, ID_type,out_file ='tmp.csv')
    SYMBOL                           biotypes         ENSEMBL  chr start   end
    1 DDX11L1 transcribed_unprocessed_pseudogene ENSG00000223972 chr1 11869 14409
    5  OR4G4P             unprocessed_pseudogene ENSG00000268020 chr1 52473 53312
    7   OR4F5                     protein_coding ENSG00000186092 chr1 65419 71585
    

    小洁老师自制r包,简化全过程16-3 12:00

    devtools::install_github("xjsun1221/tinyarray")
    
    library(tinyarray)
    geo = geo_download("GSE56649")  #下载数据
    
    View(geo$pd)
    pd = geo$pd
    library(stringr)
    Group=ifelse(str_detect(pd$source_name_ch1,"control"),
                 "control",
                 "RA")  #规定分组
    #设置参考水平,指定levels,对照组在前,处理组在后
    Group = factor(Group,
                   levels = c("control","RA"))
    Group
    
    find_anno(geo$gpl,install = T)
    
    ids <- toTable(hgu133plus2SYMBOL)
    
    geo$exp = log2(geo$exp+1)
    deg = get_deg_all(geo$exp,Group,ids)
    deg$plots #出图
    
    image.png

    相关文章

      网友评论

          本文标题:差异分析16-1

          本文链接:https://www.haomeiwen.com/subject/rdzvpltx.html