GEO——代码复现

作者: monkey_study | 来源:发表于2022-04-18 23:02 被阅读0次

    文章题目:Identification of the interaction network of hub genes for melanoma treated with vemurafenib based on microarray data####

    GEO42872 GPL6244

    B站学习了生信技能树jimmy老师的代码后,自己尝试运行复现一遍,从下载数据(GEOquery/GEOmirror)到检查数据(boxplot/PCA),分组的构建,接着是标准化流程:差异分析(可视化展示:热图、火山图),功能富集分析(GO、KEGG,这里用的是y叔的clusterProfiler包),GSEA(gene set enrichment analysis)。其中,火山图画的感觉还不错,哈哈!

    
    rm(list = ls())  #一键清空
    options(stringsAsFactors = F) #在调用as.data.frame的时,将stringsAsFactors设置为FALSE可以避免character类型自动转化为factor类型
    ##加载包
    library(GEOmirror)
    suppressMessages(library(GEOquery))
    #GSE42872数据下载
    f <- 'GSE42872_eSet.Rdata'
    if (T) {
    gse <- geoChina(gse = "GSE42872")
    save(gse,file=f)
    }
    load(f)
    exprSet <- exprs(gse[[1]])  #提取表达矩阵赋值给exprSet
    #查看数据
    dim(exprSet)  #行33297     列6
    head(exprSet)
    range(exprSet)   # [1]  2.66517 14.56410  初步判断数据可能经过了log转换
    boxplot(exprSet,las=0)  #已经过标准化,las参数调整横轴标签方向0横,1横,2纵,3纵
    
    #提取临床信息
    pdat <- pData(gse[[1]])
    View(pdat)
    library(stringr)  #字符串处理包
    group_list <- str_split(pdat$title," ",simplify = T)[,4]  #提取分组信息
    table(group_list)  #查看分组
    if (F) {
    ##注释包/GEO注释平台 GPL6244
    BiocManager::install("hugene10sttranscriptcluster.db")  #下载对应注释包
    library(hugene10sttranscriptcluster.db)  #加载包
    ls("package:hugene10sttranscriptcluster.db") #查看包得内容
    ids <- toTable(hugene10sttranscriptclusterSYMBOL)  #toTable函数提取探针id,symbol对应关系
    head(ids)  #查看前六行 
    exprs
    dim(ids) # 19869     2
    ids<- ids[ids$symbol!=" ",] # 19869     2 提取symbol 不为空得行
    ids <- ids[ids$probe_id %in%  rownames(exprSet),] #19869 
    exprSet[1:4,1:4]
    head(ids)
    exprSet <- exprSet[rownames(exprSet) %in%ids$probe_id,  ]
    dim(exprSet) # 19869     6
    table(sort(table(ids$symbol))) #查看基因探针对应关系
    # 1     2     3     4     5     6     7     8    10    26 
    # 18098   597   131    15     7     5     1     2     1     1 
    #由于多个探针对应一个基因,所以取基因在样本中的均值并排序取最大得作为唯一探针与基因对应。
    ##加载注释函数
    anno <- function(exprSet,ids){
      probes <- as.character(by(exprSet,ids$symbol,function(x)rownames(x)[which.max(rowMeans(x))]))
      exprSet=exprSet[rownames(exprSet) %in% probes ,]
      rownames(exprSet)=ids[ids$probe_id%in%rownames(exprSet),2]
      return(exprSet)
    }
    exprSet<- anno(exprSet,ids)
    exprSet[1:4,1:4]  #查看表达矩阵
    #保存注释后的表达矩阵以及分组信息
    }
    save(exprSet,group_list,file = "output.Rdata")
    
    
    ##检查分组是否合适,PCA。
    load("output.Rdata")
    dat<- exprSet
    ## 下面是画PCA的必须操作,需要看说明书。
    #画PCA图时要求是行名时样本名,列名时探针名,因此此时需要转换。将matrix转换为data.frame
    dat <- cbind(as.data.frame(t(dat)),group_list) #cbind横向追加,即将分组信息追加到最后一列
    library("FactoMineR")#画主成分分析图需要加载这两个包
    library("factoextra") 
    dim(dat)  #[1]     6 18859
    dim(exprSet)  #[1] 18858     6
    # The variable group_list is removed before PCA analysis
    dat.pca <- PCA(dat[,-ncol(dat)], graph = FALSE)#现在dat最后一列是group_list,需要重新赋值给一个dat.pca,这个矩阵是不含有分组信息的
    fviz_pca_ind(dat.pca,
                 geom.ind = "point", # 图形显示形式,可以是point,text
                 col.ind = dat$group_list, # color by groups
                 # palette = c("#00AFBB", "#E7B800"),
                 addEllipses = F, # 图例是否加框
                 legend.title = "Groups"
    )  #PCA可视化fviz_pca_ind
    ggsave('all_samples_PCA.png',width = 600,height = 600,units = "mm")
    
    ###DEGs package limma input: 1.expression matrix 2.design matrix 3. contrast matrix
    suppressMessages(library(limma))
    #1.expression matrix :exprSet
    exprSet[1:4,1:4]
    #2 design matrix
    table(group_list)
    design <- model.matrix(~0+factor(group_list,levels = c("Control","Vemurafenib"),ordered = T))
    head(design)
    colnames(design) <- levels(factor(group_list))
    rownames(design) <- colnames(exprSet)
    #3. contrast matrix
    contrast <- makeContrasts(paste0(rev(levels(factor(group_list))),collapse = "-"),
                              levels =design)
    #DEGs
    DEGs <- function(exprSet,design,contrast){
    # step1
    fit <- lmFit(exprSet,design)
    # step2
    fit1 <- contrasts.fit(fit,contrast)
    # step3
    fit2 <- eBayes(fit1)
    nrDEG  <- na.omit(topTable(fit2,coef = paste0(rev(levels(factor(group_list))),collapse = "-"),number = Inf,adjust.method = "BH"))
    return(nrDEG)
    }
    nrDEG <- DEGs(exprSet,design,contrast)
    save(nrDEG,file = 'nrDEG.Rdata')
    table(nrDEG$adj.P.Val<0.01&nrDEG$logFC>=2)
    table(nrDEG$adj.P.Val<0.01&nrDEG$logFC<=-2)
    
    #valcone plot
    if(T){ 
      library(dplyr)
      library(tidyverse)
      library(ggplot2)
      DEG_valvone <- nrDEG
      colnames(DEG_valvone)
      DEG_valvone <- DEG_valvone %>%
        select(logFC,AveExpr, adj.P.Val) %>%
        mutate(v= -log10(adj.P.Val),
               gene_type = case_when(logFC >= 2 & adj.P.Val < 0.01 ~ "up",
                                     logFC <= -2 & adj.P.Val < 0.01 ~ "down",
                                     TRUE ~ "stable")) 
      ##查看x坐标轴设定范围
      DEG_valvone %>%
        pull(logFC) %>%
        min() %>%
        floor()   #查看x轴最小边界 -5
      DEG_valvone %>%
        pull(logFC) %>%
        max() %>%
        ceiling()  #最大边界6
      
      ###设定上调下调稳定点颜色大小透明度
      cols <- c("up" = "#ffad73", "down" = "#26b3ff", "stable" = "grey") 
      sizes <- c("up" = 2, "down" = 2, "stable" = 1) 
      alphas <- c("up" = 1, "down" = 1, "stable" = 0.5)
    }
    final_plot <- DEG_valvone %>%
      ggplot(aes(x = logFC,
                 y = v,
                 fill = gene_type,    
                 size = gene_type,
                 alpha = gene_type)) + 
      geom_point(shape = 21, # Specify shape and colour as fixed local parameters    
                 colour = "black") + 
      geom_hline(yintercept = -log10(0.01),
                 linetype = "dashed") + 
      geom_vline(xintercept = c(-2, 2),
                 linetype = "dashed") +
      scale_fill_manual(values = cols) + # Modify point colour
      scale_size_manual(values = sizes) + # Modify point size
      scale_alpha_manual(values = alphas) + # Modify point transparency
      scale_x_continuous(breaks = c(seq(-6, 6, 1)),       
                         limits = c(-6, 6))+
      labs(title = "Gene expression changes in Vemurafenib versus control samples",
           x = "log2(fold change)",
           y = "-log10(adjusted P-value)",
           colour = "Expression \nchange") +
      theme_bw() + # Select theme with a white background  
      theme(panel.border = element_rect(colour = "black", fill = NA, size= 0.5),    
            panel.grid.minor = element_blank(),
            panel.grid.major = element_blank()) 
    final_plot
    ggsave("valcone_plot.png",plot = final_plot,width = 200,height = 150,units ="mm" )
    
    
    ## Plot MA plots
    if(T){DEG_valvone %>%
        pull(logFC) %>%
        min() %>%
        floor()   #查看x轴最小边界 -5
      DEG_valvone %>%
        pull(logFC) %>%
        max() %>%
        ceiling()  #最大边界6
      res <- DEG_valvone[with(DEG_valvone, order(logFC)),]  #按照logFC倒序排列
      res$threshold <- as.factor(res$adj.P.Val < 0.01)
      MAplot<- ggplot(data=res, aes(x=AveExpr, y=logFC, colour=threshold)) + 
        geom_point(alpha=0.4, size=1.8) + 
        geom_hline(aes(yintercept = 0), colour = "blue", size = 1.2) +
        ylim(c(-5,6)) + 
        labs(title = "MA plot between Vemurafenib and control samples",
             x = "Mean expression",
             y = "Log2 Fold Change")+
        theme(axis.title.x = element_text(face = "bold", size = 15),
              axis.text.x = element_text(face = "bold", size = 12)) +
        theme(axis.title.y = element_text(face = "bold", size = 15),
              axis.text.y = element_text(face = "bold", size = 12)) +
        scale_colour_discrete(name = "p.adjusted < 0.01") +
        theme(legend.title = element_text(face = "bold", size = 15)) +
        theme(legend.text = element_text(size = 14))
      ggsave("MA_plot.png",plot = MAplot,width = 200,height = 200,units ="mm" )
    }
    
    
    #GO enrichment analysis
    load("nrDEG.Rdata")
    head(nrDEG)
    if(T){ 
      suppressMessages(library(org.Hs.eg.db,
                               ggplot2,
                               dplyr,
                               tidyverse))
      nrDEG <- nrDEG %>%
        mutate(v= -log10(adj.P.Val),
               gene_type = case_when(logFC >= 2 & adj.P.Val < 0.01 ~ "up",
                                     logFC <= -2 & adj.P.Val < 0.01 ~ "down",
                                     TRUE ~ "stable")) 
      save(nrDEG,file="deg.Rdata")
    }
    load("deg.Rdata")
    head(nrDEG)
    
    nrDEG$SYMBOL <- rownames(nrDEG)
    df <- bitr(nrDEG$SYMBOL, fromType = "SYMBOL",
               toType = c( "ENTREZID"),
               OrgDb = org.Hs.eg.db) 
    nrDEG=merge(nrDEG,df,by.y='SYMBOL',by.x='SYMBOL')
    gene_up <- nrDEG[nrDEG$gene_type=="up",'ENTREZID']
    gene_down <- nrDEG[nrDEG$gene_type=="down",'ENTREZID']
    gene_df <- c(gene_up,gene_down)
    gene_all<- as.character(nrDEG[,'ENTREZID'])
    #GO
    if(F){ego_BP <- enrichGO(gene = gene_up,  #entrez gene id
                       keyType = "ENTREZID", 
                       OrgDb= org.Hs.eg.db,
                       ont = "BP",  #生物学功能
                       pAdjustMethod = "BH", 
                       universe=gene_all,
                       minGSSize = 10,
                       maxGSSize = 500,
                       pvalueCutoff = 0.05,  #adjust pvalue
                       qvalueCutoff = 0.05) #pvalue test value
    ego_MF <- enrichGO(gene = gene_up,  #entrez gene id
                       keyType = "ENTREZID", 
                       OrgDb= org.Hs.eg.db,
                       ont = "MF",  #生物学功能
                       pAdjustMethod = "BH", 
                       universe=gene_all,
                       minGSSize = 10,
                       maxGSSize = 500,
                       pvalueCutoff = 0.05,  #adjust pvalue
                       qvalueCutoff = 0.05) #pvalue test value
    ego_MF <- enrichGO(gene = gene_up,  #entrez gene id
                       keyType = "ENTREZID", 
                       OrgDb= org.Hs.eg.db,
                       ont = "CC",  #生物学功能
                       pAdjustMethod = "BH", 
                       universe=gene_all,
                       minGSSize = 10,
                       maxGSSize = 500,
                       pvalueCutoff = 0.05,  #adjust pvalue
                       qvalueCutoff = 0.05) 
    }
    ego_all <- enrichGO(gene = gene_df,  #entrez gene id
                       keyType = "ENTREZID", 
                       OrgDb= org.Hs.eg.db,
                       ont = "CC",  #生物学功能
                       pAdjustMethod = "BH", 
                       universe=gene_all,
                       minGSSize = 10,
                       maxGSSize = 500,
                       pvalueCutoff = 0.05,  #adjust pvalue
                       qvalueCutoff = 0.05,
                       readable = T)
    save(ego_all,file = "GO_result.Rdata")
    
    #可视化
    dotplot(ego_all)   #气泡图
    cnetplot(ego_all)
    barplot(ego_all)  #条带图
    cnetplot(ego_all, categorySize="pvalue",colorEdge = TRUE,circular = TRUE) #简易弦图
    
    ##KEGG
    geo_kegg <- enrichKEGG(gene_df,
                           organism = "hsa",keyType = "kegg",
                           pvalueCutoff = 0.05,
                           pAdjustMethod = "BH",
                           universe=gene_all,
                           minGSSize = 10,
                           maxGSSize = 500,
                           qvalueCutoff =2,
                           use_internal_data = FALSE
    )
    #查看富集结果
    table(geo_kegg@result$p.adjust<0.05)  #1个
    dotplot(geo_kegg)
    

    相关文章

      网友评论

        本文标题:GEO——代码复现

        本文链接:https://www.haomeiwen.com/subject/jmpjertx.html