美文网首页
GEO代码片段(芯片差异分析及绘图)

GEO代码片段(芯片差异分析及绘图)

作者: 郭师傅 | 来源:发表于2022-05-03 11:57 被阅读0次

    1.读入保存的表达矩阵数据

    rm(list = ls())
    path <- "F:\\myresearch\\paper_recurrence\\GSE129409"
    setwd(path)
    
    library(sva)
    library(dplyr)
    library(stringr)
    library(GEOquery)
    library(limma)
    library(styler)
    select <- dplyr::select
    
    gse_id <- "GSE129409"
    
    gse <- readRDS(paste0(".\\data\\",gse_id,"_gse.rds"))
    
    exprSet <- exprs(gse[[1]]) # 基因表达矩阵
    dim(exprSet)
    head(exprSet)
    pdata<- pData(gse[[1]])    # 分组信息,原始文件地址等
    
    gpl_id <- gse$GSE129409_series_matrix.txt.gz@annotation
    

    2.获取GPL文件并注释

    gpl <- getGEO(GEO = gpl_id,
                  destdir = "F:\\myresearch\\GEO\\GPLfiles")
    
    gpl <- gpl@dataTable@table 
    
    probe2symbol_df <- gpl %>% mutate(probe_id = .$ID,
                          symbol = .$circRNA) %>% 
      select(probe_id,symbol)
    
    # colnames(probe2symbol_df) <- c("probe_id","symbol")
    exprSet <- as.data.frame(exprSet)  #change express matrix to dataframe
    exprSet$probe_id <- rownames(exprSet)  #make a new column of probe_id by rowname
    exprSet$probe_id <- as.character(exprSet$probe_id)
    #match probe_id and gene symbol
    exprSet <- exprSet %>% 
      inner_join(probe2symbol_df,by="probe_id") %>% #合并探针的信息
      dplyr::select(-probe_id) %>% #去掉多余信息
      dplyr::select(symbol, everything()) %>% #重新排列,
      mutate(rowMean =rowMeans(.[grep("GSM", names(.))])) %>% #求出平均数(这边的.真的是画龙点睛)
      arrange(desc(rowMean)) %>% #把表达量的平均值按从大到小排序
      distinct(symbol,.keep_all = T) %>% # symbol留下第一个
      dplyr::select(-rowMean) %>% #反向选择去除rowMean这一列
      tibble::column_to_rownames(colnames(.)[1]) # 把第一列变成行名并删除
    
    # save(gpl,probe2symbol_df,file = "F:\\myresearch\\GEO\\GPLfiles\\GPL21825")
    

    3.数据检验和质控

    ##--------------------------------------------数据检验和聚类------------------------------
    exprSet_L <- melt(exprSet)
    colnames(exprSet_L) = c('sample','value')
    exprSet_L$group = rep(pdata$group,each = nrow(exprSet))
    
    exprSet_L[1:10,]
    ##箱线图
    p21 <-  ggplot(exprSet_L,aes(x = sample, y = value,fill = group)) +geom_boxplot()+
      theme(axis.title.x  = element_text(face = 'italic'),
            axis.text.x =  element_text(angle = 45 , vjust = 0.5),
            legend.position = "top")
    p21            #数据不太规整
    #ggsave(p21,filename = ".\\plots\\bar_before.pdf",width = 18,height = 9,units = "cm")
    ##标准化后再图
    exprSet_1 <- normalizeBetweenArrays(exprSet) %>% data.frame()
    exprSet_L <- melt(exprSet_1)
    colnames(exprSet_L) = c('sample','value')
    exprSet_L$group = rep(pdata$group,each = nrow(exprSet))
    p22 <-  ggplot(exprSet_L,aes(x = sample, y = value,fill = group)) +geom_boxplot()+
      theme(axis.title.x  = element_text(face = 'italic'),
            axis.text.x =  element_text(angle = 45 , vjust = 0.5),
            legend.position = 0)
    p22            #表准化后规整多了,但知道效果咋样
    
    plot_qc <- plot_grid(p21,p22,labels = "AUTO",nrow = 2)
    plot_qc
    dir.create("plots")
    ggsave(plot_qc,filename = ".\\plots\\plot_qc.pdf",width = 18,height = 18,units = "cm")
    #把标准化后的数据赋值给原始数据,暂时不做
    #exprSet <- exprSet_1
    

    4.聚类图和PCA图

    # 聚类图-------------------------------------------
    colnames(exprSet) <- paste(pdata$title, sep = "")    ## 样本名称改变
    # hc <- hclust(dist(t(exprSet)))                      #hclust需要用plot画图,此例不用
    dist.r <- dist(t(exprSet))
    res <- hcut(dist.r, k = 2, stand = TRUE)
    # Visualize
    cluster_unbatch <- fviz_dend(res,
                                 # 加边框
                                 rect = TRUE,
                                 # 边框颜色
                                 rect_border = "cluster",
                                 # 边框线条类型
                                 rect_lty = 2,
                                 # 边框线条粗细
                                 lwd = 1.2,
                                 # 边框填充
                                 rect_fill = T,
                                 # 字体大小
                                 cex = 0.8,
                                 # 字体颜色
                                 color_labels_by_k = T,
                                 # 平行放置
                                 horiz = T,
                                 k_colors = "lancet",      #指定色板
                                 labels_track_height = 7,
                                 main = element_blank()
    )+theme(text = element_text(size = 8))
    cluster_unbatch
    
    # 把样本名改回来
    colnames(exprSet) <- pdata$geo_accession
    # pca图-------------------------------------------------------
    dat.pca <- PCA(as.data.frame(t(exprSet)), graph = FALSE)
    pca_plot <- fviz_pca_ind(dat.pca,
                             geom.ind = "point", # show points only (nbut not "text")
                             col.ind = group_list, # color by groups
                             #palette = c("#00AFBB", "#E7B800"),  #修改颜色
                             addEllipses = TRUE, # Concentration ellipses
                             # legend.title = "GROUP"
    )+theme(legend.position = c(0.80,1),
            text = element_text(size = 8),
            legend.key.size = unit(3,"mm"),
            legend.title = element_blank()
    )
    pca_plot
    ggsave(plot = pca_plot,,width = 89,height = 89,units = "mm",filename = paste0(".\\plots\\",gse_id,"PCA.pdf"))
    

    5.差异分析

    ##--------------------------------------------开始差异分析----------------------------------------
    #表达矩阵:exprSet
    #分组列表:group_list
    
    design <- model.matrix(~0+factor(group_list))
    # design <- model.matrix(~group)
    colnames(design) <- levels(factor(group_list))
    
    fit <- lmFit(exprSet,design)
    
    # makeContrasts里边组的顺序会影响结果
    # 根据目前资料,实验组写左,对照组写右
    group_list
    cont.matrix<-makeContrasts(atrial_fibrillation-healthy_control,levels = design)
    cont.matrix
    #做完后实验组是1,对照组是-1,此处注释有待分析
    fit2=contrasts.fit(fit,cont.matrix)
    fit2 <- eBayes(fit2)  ## default no trend !!!
    ##eBayes() with trend=TRUE
    #通过修改p.value,改变输出基因,过小可能无输出。实际影响的是adj.p,输出基因的不同可能影响火山图的起点(根部)
    tempOutput = topTable(fit2,coef=1,n=Inf,adjust="BH",p.value = 0.5)    
    #tempOutput1 = topTable(fit2,coef=1,n=Inf,adjust="fdr",p.value = 0.05) 
    nrDEG = na.omit(tempOutput) 
    
    allDiff <- nrDEG
    

    6.基因筛选

    # 基因筛选-------------------------------------------------
    # 定义加上下调标记函数,适用于limma包结果
    add_sign_p <- function(df){
      df <- df %>% mutate(sign = dplyr::case_when(df$P.Value < p_cutoff & df$logFC> logFC_cutoff ~ "up",
                                                  df$P.Value < p_cutoff & df$logFC< -logFC_cutoff ~ "down",
                                                  TRUE ~ "stable")
      )
      return(df)
    }
    
    add_sign <- function(df){
      df <- df %>% mutate(sign = dplyr::case_when(df$adj.P.Value < p_cutoff & df$logFC> logFC_cutoff ~ "up",
                                                  df$adj.P.Value < p_cutoff & df$logFC< -logFC_cutoff ~ "down",
                                                  TRUE ~ "stable")
      )
      return(df)
    }
    
    logFC_cutoff=1                    #fold change=1意思是差异是两倍
    p_cutoff = 0.05                     #padj=0.05意思是矫正后P值小于0.05
    
    diff <- add_sign_p(allDiff )
    
    diffSig <- diff %>% filter(sign == "down" | sign == "up")
    save(diff,diffSig,diffUp,diffDown,file = paste0(".\\data\\",gse_id,"deg.RData"))
    

    7.绘制火山图

    #-------------------------------------------绘制火山图---------------------------------------
    von_1 <- ggplot(
      #设置数据
      diff, 
      aes(x = logFC, y = -log10(P.Value), colour=change)) +                 #P.value或adj.P.Val需要根据情况修改
      geom_point(alpha=0.4, size=2) +
      scale_color_manual(values=c("#546de5", "#d2dae2","#ff4757"))+
      
      # 辅助线,根据具体情况修改intercept数值
      geom_vline(xintercept=c(-2,2),lty=4,col="black",lwd=0.6) +
      geom_hline(yintercept = -log10(pvalue),lty=4,col="black",lwd=0.6) +
      
      # 坐标轴
      labs(x="log2(fold change)",
           y="-log10 (p-value)")+
      theme_bw()+
      # 图例
      theme(plot.title = element_text(hjust = 0.5), 
            legend.position = c(0.85,0.85), 
            legend.title = element_blank(),
            legend.background = element_blank(),
            legend.box.background = element_blank(),
            text = element_text(size = 8),
            
            
      )
    von_1
    # ggsave(von_1,filename = str_c(".\\plots\\",gse_id,"_volcano_padj.pdf"),
    #        width = 9,height = 9,units = "cm")
    
    # 定义显示的标记基因
    # 将需要标记的基因放置在label列
    # 这里设置logFC值大于3的差异基因来标记
    # !!!需要注意的是标记的基因不能太多,Rstudio容易卡死,提前用summary()查看
    # P.value或adj.P.Val需要根据情况修改,包括pvalue或padj
    diff$label = ifelse(diff$P.Value < pvalue & abs(diff$logFC) >=3, as.character(rownames(diff)),"")
    
    von_labeled <- von_1+geom_text_repel(data = diff, aes(x = logFC, 
                                                          y = -log10(P.Value), 
                                                          label = label),
                                         size = 2,box.padding = unit(0.5, "lines"),
                                         point.padding = unit(0.8, "lines"), 
                                         segment.color = "black", 
                                         show.legend = FALSE)
    von_labeled
    ggsave(von_labeled,
           width = 9,height = 9,units = "cm",
           filename = str_c(".\\plots\\",gse_id,"_volcano_padj_labeled.pdf"))
    

    8.绘制热图

    ##-----------------------------------------heatmap plot-----------------------------------------
    library(pheatmap)
    ## 设定差异基因阈值,减少差异基因用于提取表达矩阵
    allDiff_pair=topTable(fit2,adjust='BH',coef= 1,number=Inf,p.value=0.05,lfc =1) 
    ##提前部分数据用作热图绘制
    heatdata <- exprSet[rownames(diffSig),]
    top100 <- heatdata
    
    ##制作一个分组信息用于注释,行名是标本号,值是分组
    annotation_col <- data.frame(group_list)
    
    rownames(annotation_col) <- colnames(exprSet)
    
    ##如果注释出界,可以通过调整格子比例和字体修正
    heatmap <- pheatmap(top100, #热图的数据
                        cluster_rows = T,#行聚类
                        cluster_cols = TRUE,#列聚类,可以看出样本之间的区分度
                        annotation_col = annotation_col, #标注样本分类
                        annotation_legend = TRUE, # 显示注释
                        show_rownames = T,# 显示行名
                        show_colnames = T,# 显示列名
                        scale = "row", #以行来标准化,这个功能很不错
                        color = colorRampPalette(c("navy", "white", "firebrick3"))(100),
                        border_color = FALSE,
                        fontsize = 7,
                        legend = T) 
    dim(top100)
    
    ggsave(heatmap,filename = str_c(".\\plots\\",gse_id,"_deg_heatmap.pdf"),
           width = 13,height = 9,units = "cm")
    

    相关文章

      网友评论

          本文标题:GEO代码片段(芯片差异分析及绘图)

          本文链接:https://www.haomeiwen.com/subject/spvbyrtx.html