美文网首页GEO数据挖掘
GEO数据分析之GSEA

GEO数据分析之GSEA

作者: 天涯清水 | 来源:发表于2020-01-06 22:38 被阅读0次

    GSEA-analysis

    1.加载数据

    载入前一步分析得到的表达矩阵
    library(ggstatsplot);
    library(cowplot);
    library(clusterProfiler);
    library(stringr);
    library(dplyr);
    library(tidyr);
    library(ggplot2);
    library(ggstatsplot);
    load(file = 'GSE63067_GSEA.Rdata')#导入上一步分析的数据
    exprSet <- data_plot
    exprSet[1:3,1:3]
    
    ##                PAX8   CYP2A6   SCARB1
    ## GSM1539877 6.506860 11.94711 9.129116
    ## GSM1539878 6.313513 11.82544 9.402811
    ## GSM1539879 6.273058 11.42314 8.120977
    

    2.批量相关性分析

    将第一行目的基因跟其他行的编码基因批量做相关性分析,得到相关性系数以及p值。
    y <- as.numeric(exprSet[,"CCL20"])
    
    colnames <- colnames(exprSet)
    
    cor_data_df <- data.frame(colnames)
    
    for (i in 1:length(colnames)){
    
      test <- cor.test(as.numeric(exprSet[,i]),y,type="spearman")
    
      cor_data_df[i,2] <- test$estimate
    
      cor_data_df[i,3] <- test$p.value
    
    }
    
    names(cor_data_df) <- c("symbol","correlation","pvalue")
    
    # 查看这个数据结构
    
    head(cor_data_df)
    
    ##   symbol correlation      pvalue
    ## 1   PAX8 -0.23354999 0.350963277
    ## 2 CYP2A6 -0.60172099 0.008244347
    ## 3 SCARB1 -0.19907443 0.428394688
    ## 4 TTLL12 -0.57277340 0.012974684
    ## 5  CYTOR  0.35144428 0.152686677
    ## 6 ADAM32 -0.01286106 0.959604984
    

    3.筛选最相关的基因

    筛选p值小于0.05,按照相关性系数绝对值选前500个的基因, 数量可以自己定。
    cor_data_sig <- cor_data_df %>% 
    
      filter(pvalue < 0.05) %>% 
    
      arrange(desc(abs(correlation)))%>% 
    
      dplyr::slice(1:500)
    

    4.随机选取正的和负的分别作图验证

    正相关的选取IL2RG;负相关选取MARK1
    #正相关的选取IL2RG
    
    ggscatterstats(data = exprSet, 
    
                   y = CCL20, 
    
                   x = IL2RG,
    
                   centrality.para = "mean",                              
    
                   margins = "both",                                         
    
                   xfill = "#CC79A7", 
    
                   yfill = "#009E73", 
    
                   marginal.type = "histogram",
    
                   title = "Relationship between CCL20 and IL2RG")
    
    ## Warning: This plot can't be further modified with `ggplot2` functions.
    ## In case you want a `ggplot` object, set `marginal = FALSE`.
    
    #负相关的选取MARK1
    
    ggscatterstats(data = exprSet, 
    
                   y = CCL20, 
    
                   x = MARK1,
    
                   centrality.para = "mean",                              
    
                   margins = "both",                                         
    
                   xfill = "#CC79A7", 
    
                   yfill = "#009E73", 
    
                   marginal.type = "histogram",
    
                   title = "Relationship between CCL20 and IL2RG")
    
    ## Warning: This plot can't be further modified with `ggplot2` functions.
    ## In case you want a `ggplot` object, set `marginal = FALSE`.
    
    image.png
    #还可以用cowplot拼图
    
    p1 <- ggscatterstats(data = exprSet, 
    
                         y = CCL20, 
    
                         x = IL2RG,
    
                         centrality.para = "mean",                              
    
                         margins = "both",                                         
    
                         xfill = "#CC79A7", 
    
                         yfill = "#009E73", 
    
                         marginal.type = "histogram",
    
                         title = "Relationship between CCL20 and IL2RG")
    
    ## Warning: This plot can't be further modified with `ggplot2` functions.
    ## In case you want a `ggplot` object, set `marginal = FALSE`.
    
    p2 <- ggscatterstats(data = exprSet, 
    
                         y = CCL20, 
    
                         x = MARK1,
    
                         centrality.para = "mean",                              
    
                         margins = "both",                                         
    
                         xfill = "#CC79A7", 
    
                         yfill = "#009E73", 
    
                         marginal.type = "histogram",
    
                         title = "Relationship between CCL20 and IL2RG")
    
    ## Warning: This plot can't be further modified with `ggplot2` functions.
    ## In case you want a `ggplot` object, set `marginal = FALSE`.
    
    plot_grid(p1,p2,nrow = 1,labels = LETTERS[1:2])
    
    image.png

    5.聚类分析

    既然确定了相关性是正确的,那么用筛选的基因进行富集分析就可以反推这个基因的功能。
    #获得基因列表
    
    gene <- str_trim(cor_data_sig$symbol,'both')
    
    #基因名称转换,返回的是数据框
    
    gene = bitr(gene, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db")
    
    go <- enrichGO(gene = gene$ENTREZID, OrgDb = "org.Hs.eg.db", ont="all")
    
    # 这里因为是计算的所有GO分析的三个分类,所以可以合并作图
    
    # 这是条形图
    
    barplot(go, split="ONTOLOGY")+ 
      facet_grid(ONTOLOGY~., scale="free")
    
    image.png
    # 这是气泡图
    
    dotplot(go, split="ONTOLOGY")+ 
      facet_grid(ONTOLOGY~., scale="free")
    
    image.png
    # 
    # 这时候,我们能推断CCL20这个基因主要参与免疫调控和T细胞激活,细胞因子受体活性调剂等功能,大致跟她本身的功能是一致的。 
    

    参考

    相关文章

      网友评论

        本文标题:GEO数据分析之GSEA

        本文链接:https://www.haomeiwen.com/subject/mcelactx.html