美文网首页GEO数据挖掘
第一步:简单的GEO数据提取及数据分析

第一步:简单的GEO数据提取及数据分析

作者: 碌碌无为的杰少 | 来源:发表于2020-05-20 16:12 被阅读0次

    简介

    这是复现一篇结肠癌的文献图例,GSE4107,文献地址https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4399548/pdf/ott-8-745.pdf,其中由于作者细节未公布,可能差异基因个数有稍许差别

    数据提取

    利用getGEO函数下载GSE4107,用exprs函数获的exp表达矩阵,pData函数获取临床信息pd临床数据,@annotation获得gpl平台数据。

    #数据下载
    rm(list = ls())
    options(stringsAsFactors = F)
    library(GEOquery)
    gse = "GSE4107"
    eSet <- getGEO(gse, 
                   destdir = '.', 
                   getGPL = F)
    #(1)提取表达矩阵exp
    exp <- exprs(eSet[[1]])
    exp[1:4,1:4]
    exp = log2(exp+1)
    #(2)提取临床信息
    pd <- pData(eSet[[1]])
    #(3)调整pd的行名顺序与exp列名完全一致
    p = identical(rownames(pd),colnames(exp));p
    if(!p) exp = exp[,match(rownames(pd),colnames(exp))]
    #(4)提取芯片平台编号
    gpl <- eSet[[1]]@annotation
    save(gse,pd,exp,gpl,file = "step1output.Rdata")
    

    获取GPL平台信息

    首先利用pd中的title信息获取临床信息并factor化,再利用GPL获取探针信息。

    rm(list = ls())  
    load(file = "step1output.Rdata")
    library(stringr)
    pd$title
    group_list=ifelse(str_detect(pd$title,"healthy control"),"control","treat")
    #设置参考水平,对照在前,处理在后
    group_list = factor(group_list,
                        levels = c("control","treat"))
    #2.ids-----------------
    #方法1 BioconductorR包
    gpl 
    #http://www.bio-info-trainee.com/1399.html
    if(!require(hgu133plus2.db))BiocManager::install("hgu133plus2.db")
    library(hgu133plus2.db)
    ls("package:hgu133plus2.db")
    ids <- toTable(hgu133plus2SYMBOL)
    head(ids)
    # 方法2 读取gpl页面的soft文件,按列取子集
    # 方法3 官网下载
    # 方法4 自主注释 
    save(exp,group_list,ids,file = "step2output.Rdata")
    

    制作PCA图和热图

    这一步只要提供expgrou_list其他不用变

    rm(list = ls())  
    load(file = "step1output.Rdata")
    load(file = "step2output.Rdata")
    boxplot(exp,outline=FALSE,notch=T,las=2)
    #输入数据:exp和group_list
    #Principal Component Analysis
    #http://www.sthda.com/english/articles/31-principal-component-methods-in-r-practical-guide/112-pca-principal-component-analysis-essentials
    
    dat=as.data.frame(t(exp))
    library(FactoMineR)#画主成分分析图需要加载这两个包
    library(factoextra) 
    # pca的统一操作走起
    dat.pca <- PCA(dat, graph = FALSE)
    pca_plot <- fviz_pca_ind(dat.pca,
                             geom.ind = "point", # show points only (nbut not "text")
                             col.ind = group_list, # color by groups
                             #palette = c("#00AFBB", "#E7B800"),
                             addEllipses = TRUE, # Concentration ellipses
                             legend.title = "Groups"
    )
    pca_plot
    ggsave(plot = pca_plot,filename = paste0(gse,"PCA.png"))
    save(pca_plot,file = "pca_plot.Rdata")
    
    #热图 
    cg=names(tail(sort(apply(exp,1,sd)),1000))
    n=exp[cg,]
    
    #绘制热图
    annotation_col=data.frame(group=group_list)
    rownames(annotation_col)=colnames(n) 
    library(pheatmap)
    pheatmap1 <- pheatmap(n,
             show_colnames =F,
             show_rownames = F,
             annotation_col=annotation_col,
             scale = "row")
    ggsave(plot = pheatmap1,filename = paste0(gse,"pheatmap1.png"))
    

    做差异分析

    利用贝叶斯获取表达差异矩阵,用inner_join获得探针信息,获得deg表达。

    rm(list = ls()) 
    load(file = "step2output.Rdata")
    #差异分析,用limma包来做
    #需要表达矩阵和group_list,不需要改
    library(limma)
    design=model.matrix(~group_list)
    fit=lmFit(exp,design)
    fit=eBayes(fit)
    deg=topTable(fit,coef=2,number = Inf)
    
    #为deg数据框添加几列
    #1.加probe_id列,把行名变成一列
    library(dplyr)
    deg <- mutate(deg,probe_id=rownames(deg))
    head(deg)
    #2.加symbol列,火山图要用
    deg <- inner_join(deg,ids,by="probe_id")
    head(deg)
    deg <- deg[!duplicated(deg$symbol),]
    #3.加change列,标记上下调基因
    logFC_t=1
    P.Value_t = 0.05
    k1 = (deg$adj.P.Val < P.Value_t)&(deg$logFC < -logFC_t)
    k2 = (deg$adj.P.Val < P.Value_t)&(deg$logFC > logFC_t)
    table(k1)
    table(k2)
    change = ifelse(k1,"down",ifelse(k2,"up","stable"))
    deg <- mutate(deg,change)
    

    火山图

    提供了差异前几基因或者某一单独基因的实现方法

    load(file = "step1output.Rdata")
    load(file = "step4output.Rdata")
    #1.火山图----
    library(dplyr)
    library(ggplot2)
    
    dat  = deg
    if(T) {
      x1 = dat %>% 
        filter(change == "up") %>% 
        arrange(desc(logFC))%>%
        head(3)
      x2 = dat %>% 
        filter(change == "down") %>% 
        arrange(logFC)%>%
        head(3)
      for_label = rbind(x1,x2)
    }
    
    #%>%
    if(F){
      for_label <- dat%>% 
        filter(symbol %in% c("VIP","SFRP1")) 
    }
    if(F){
      for_label <- dat %>% head(10)
    }
    if(F) {
      x1 = dat %>% 
        filter(change == "up") %>% 
        head(3)
      x2 = dat %>% 
        filter(change == "down") %>% 
        head(3)
      for_label = rbind(x1,x2)
    }
    
    #  head(10)
    p <- ggplot(data = dat, 
                aes(x = logFC, 
                    y = -log10(adj.P.Val))) +
      geom_point(alpha=0.4, size=3.5, 
                 aes(color=change)) +
      ylab("-log10(Pvalue)")+
      scale_color_manual(values=c("blue", "grey","red"))+
      geom_vline(xintercept=c(-logFC_t,logFC_t),lty=4,col="black",lwd=0.8) +
      geom_hline(yintercept = -log10(P.Value_t),lty=4,col="black",lwd=0.8) +
      theme_bw()+
      theme(panel.border = element_blank(),panel.grid.major = element_blank(),
            
             panel.grid.minor = element_blank(),axis.line = element_line(colour = "black"))
    p
    volcano_plot <- p +
      geom_point(size = 3, shape = 1, data = for_label) +
      ggrepel::geom_label_repel(
        aes(label = symbol),
        data = for_label,
        color="black")
    ggsave(plot = volcano_plot,filename = paste0(gse,"volcano.png"))
    
    image.png

    热图和拼图

    热图提供前多少基因或者所有差异基因的热图,if函数提供了标记列名的函数,最后用patchwork进行拼图

    load(file = 'step2output.Rdata')
    dd1=deg
    x=dd1$logFC 
    names(x)=dd1$probe_id
    cg=c(names(head(sort(x),30)),names(tail(sort(x),30)))
    cg
    #cg = deg$probe_id[deg$change !="stable"]
    rownames(dd1) <- dd1$probe_id
    dd2 <- dd1[c(names(head(sort(x),30)),names(tail(sort(x),30))),"symbol"]
    dd2
    if(F) {
      rownames(dd1) <- dd1$probe_id
      dd2 <- dd1[c(names(head(sort(x),30)),names(tail(sort(x),30))),"symbol"]
      dd2
    }
    n=exp[cg,]
    #rownames(n) <- dd2
    dim(n)
    #作热图
    library(pheatmap)
    annotation_col=data.frame(group=group_list)
    rownames(annotation_col)=colnames(n) 
    library(ggplotify)
    heatmap_plot <- as.ggplot(pheatmap(n,show_colnames =F,
                                       show_rownames = T,
                                       scale = "row",
                                       #cluster_cols = F, 
                                       annotation_col=annotation_col
                                  )) 
    heatmap_plot
    ggsave(heatmap_plot,filename = paste0(gse,"heatmap.png"))
    load("pca_plot.Rdata")
    library(patchwork)
    (pca_plot | volcano_plot )/heatmap_plot+ plot_annotation(tag_levels = "A")
    Cggsave(filename = "test.png",width = 15,height = 5)
    
    image.png

    相关文章

      网友评论

        本文标题:第一步:简单的GEO数据提取及数据分析

        本文链接:https://www.haomeiwen.com/subject/lolvohtx.html