美文网首页Bio_Methods
【年末福利】以单基因生信分析切入,分享ssgsva、GSEA等套

【年末福利】以单基因生信分析切入,分享ssgsva、GSEA等套

作者: 研平方 | 来源:发表于2022-01-23 22:00 被阅读0次

    1. 套路一:观察重要表型与感兴趣基因之间的关系。

    比如观察KRAS有无突变两组之间,感兴趣基因的表达是否有差异。

    1.1 整理数据:提取感兴趣基因和临床表型数据

    expr <- exprs[exprs$symbol %in% c("Your Interesting Gene Symbol"),]
    rownames(expr) <- expr$symbol
    expr <- expr[,-1]
    expr <- as.data.frame(t(expr))
    str(expr)
    expr$Source.Name <- rownames(expr)
    comData <- merge(expr,pheno,by="Source.Name") #合并表达和表型数据
    

    1.2 以表型为分组,观察感兴趣基因之间是否具有差异表达。

    ibrary(reshape2)
    library(gridExtra)
    library(ggthemes)
    library(ggplot2)
    library(ggpubr)
    library(ggsignif)
    table(comData$Characteristics.krasmut.)
    df <- comData[comData$Characteristics.krasmut. !="not available",]
    table(df$Characteristics.krasmut.)
    
    plot <- ggplot(df,aes(x=Characteristics.krasmut., y="Your Gene", fill=Characteristics.krasmut.))+
      geom_boxplot(aes(fill=Characteristics.krasmut.), outlier.shape = NA)+
      geom_jitter(size=0.1, alpha=0.2, position=position_jitter(0.3))+ 
      stat_compare_means(label = "p.signif", method = "t.test", label.x = 1.5, label.y.npc = "top", hide.ns = TRUE, col="red")+
      theme_pubclean() +
      scale_x_discrete(name="")+  
      theme(plot.title = element_text(size = 13, face = "bold"),
            legend.position = "right",
            text = element_text(size = 13),
            axis.title = element_text(face="bold"),
            axis.text.x=element_text(size = 12, angle = 45, hjust = 1.0, vjust = 1.0)) 
    plot
    

    2. 套路二:生存分析

    library(survival)
    library(survminer)
    library(ggthemes)
    ## remove the "not available" data 
    df <- comData[comData$Characteristics.os.delay. !="not available",]
    df$os <- as.numeric(df$Characteristics.os.delay.)
    df$event <- as.numeric(df$Characteristics.os.event.)
    
    survi_cutoff <- surv_cutpoint(data = df, time = "os", event = "event",
                                  variables = "Your gene") 
    summary(survi_cutoff)
    sur_cut <- surv_categorize(survi_cutoff)
    head(sur_cut)
    sur_cut$os <- as.numeric(sur_cut$os)
    sur_cut$event <- as.numeric(sur_cut$event)
    fit <- survfit(Surv("os", "event") ~Your gene, data = sur_cut)
    ggsurvplot(fit, data = sur_cat, conf.int = T,  # 95%CI
               size = 0.6,  # change line size
               pval = T,  # p-value of log-rank test
               risk.table = TRUE,  
               xlab=c("Overall survival(months)"), 
               ggtheme = theme_gray(), # theme_minimal() theme_gray() theme_classic() theme_bw()
               legend.title="Your gene", legend.labs=c("High","Low") )
    

    3. 套路三:ssgsva

    3.1 make genelist and load expression data

    ## genelist 
    rm(list = ls())
    library(GSVA)
    library(GSEABase)
    library(msigdbr)
    library(clusterProfiler)
    library(org.Hs.eg.db)
    library(enrichplot)
    library(limma)
    library(readxl)
    genelist <- read_excel("genelist.xlsx")
    genelist <- as.data.frame(genelist)
    pathway_list <- lapply(genelist, function(x) {
      unique(na.omit(x)) 
    })
    
    ## expression data
    load("expr.Rdata")
    row.names(expr) <- expr$symbol
    exprs <- dplyr::select(expr, -symbol)
    gsva <- gsva(as.matrix(exprs), pathway_list,method='ssgsea',
                           kcdf='Gaussian',abs.ranking=TRUE)
    write.csv(gsva, file = "gsva.csv")
    
    ## setting group according your interesting gene expression
    df <- exprs["Your Gene",]
    df <- as.data.frame(t(df))
    df$group <- ifelse(df$Your Gene > median(df$Your Gene), "High", "Low")
    table(df$group)
    gsva_matrix <- t(gsva)
    
    ## Make sure the samples are in the correct order
    df <- df[rownames(gsva),]
    gsva_matrix <- cbind(gsva_matrix,df)
    

    3.2 make analysis according the gsva result

    library(reshape2)
    library(gridExtra)
    library(ggthemes)
    library(ggplot2)
    library(ggpubr)
    library(ggsignif)
    table(gsva_matrix$group)
    rt <- gsva_matrix[,c(1:29,31)]
    df <- melt(rt, id.vars = "group", variable.name = "immuneCells",
               value.name = "Expression")
    df$group <- factor(df$group, levels=c('High','Low'))
    plot <- ggplot(df,aes(x=immuneCells, y=Expression, fill=group))+
      geom_boxplot(aes(fill=group), outlier.shape = NA)+
      geom_jitter(size=0.1, alpha=0.2, position=position_jitter(0.3))+ 
      #geom_signif(comparisons = compaired, map_signif_level = T)+
      stat_compare_means(label = "p.signif", method = "t.test", label.x = 1.5, label.y.npc = "top", hide.ns = TRUE, col="red")+
      theme_pubclean() +
      scale_x_discrete(name="")+  
      theme(plot.title = element_text(size = 13, face = "bold"),
            legend.position = "right",
            text = element_text(size = 13),
            axis.title = element_text(face="bold"),
            axis.text.x=element_text(size = 12, angle = 45, hjust = 1.0, vjust = 1.0)) 
    plot
    

    4. 套路四:以单基因高低表达分组,limma差异分析

    df <- exprs["Your Gene",]
    df <- as.data.frame(t(df))
    df$group <- ifelse(df$Your Gene > median(df$Your Gene), "High", "Low")
    table(df$group)
    
    library(limma)
    design.factor <- factor(df$group, levels=c('Low','High')) 
    design.matrix <- model.matrix(~0+design.factor)
    colnames(design.matrix) <- levels(design.factor)
    design.matrix
    
    fit <- lmFit(exprs, design.matrix)
    cont.matrix <- makeContrasts(High-Low, levels=design.matrix)
    fit2 <- contrasts.fit(fit, cont.matrix)
    fit2 <- eBayes(fit2)
    DEG <- topTable(fit2,adjust="fdr",sort.by="B",number=Inf)
    head(DEG)
    DEG_filter <-subset(DEG, abs(logFC)>=1 & adj.P.Val<0.05)
    dim(DEG_filter)
    

    5. 套路五:GO、KEGG差异分析

    library(DOSE)
    library(GO.db)
    library(org.Hs.eg.db)
    library(GSEABase)
    library(clusterProfiler)
    
    ## symbol ID transform to entre id
    symbol=as.character(rownames(DEG_filter)) 
    eg = bitr(symbol, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db")
    id = as.character(eg[,2])
    head(id)
    
    ## MF
    egomf <- enrichGO(gene = id,
                      OrgDb = org.Hs.eg.db,
                      ont = "MF",
                      pAdjustMethod = "BH",
                      pvalueCutoff = 0.05,
                      qvalueCutoff = 0.05,
                      readable = TRUE)
    barplot(egomf, showCategory=15)
    dotplot(egomf)
    emapplot(egomf)
    write.csv(egomf@result,"egomf.csv") 
    
    ## CC
    egocc <- enrichGO(gene = id,
                      OrgDb = org.Hs.eg.db,
                      ont = "CC",
                      pAdjustMethod = "BH",
                      pvalueCutoff = 0.05,
                      qvalueCutoff = 0.05,
                      readable = TRUE)
    barplot(egocc, showCategory=15)
    dotplot(egocc)
    write.csv(egocc@result,"egocc.csv") 
    
    ## BP
    egobp <- enrichGO(gene = id,
                      OrgDb = org.Hs.eg.db,
                      ont = "BP",
                      pAdjustMethod = "BH",
                      pvalueCutoff = 0.05,
                      qvalueCutoff = 0.05,
                      readable = TRUE)
    barplot(egobp, showCategory=15)
    dotplot(egobp)
    emapplot(egobp)
    write.csv(egobp@result,"egobp.csv") 
    
    kk <- enrichKEGG(gene = id,
                     organism = 'hsa',
                     pvalueCutoff = 0.05,
                     pAdjustMethod = "BH",
                     qvalueCutoff = 0.05)
    barplot(kk, showCategory=15)
    dotplot(kk)
    kk <- setReadable(kk, OrgDb = org.Hs.eg.db, keyType="ENTREZID")
    write.csv(kk@result,"kegg.csv")
    browseKEGG(kk, 'hsa04972')
    

    6. 套路六:GSEA

    6.1 对limma差异分析的结果,进行整理

    symbol=as.character(rownames(DEG_filter)) 
    eg = bitr(symbol, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db")
    id = as.character(eg[,2])
    head(id)
    is.data.frame(eg)
    gene <- dplyr::distinct(eg, SYMBOL,.keep_all=TRUE)
    
    DEG_filter$SYMBOL <- rownames(DEG_filter)
    data_all_sort <- DEG_filter %>% 
      dplyr::inner_join(gene, by="SYMBOL") %>%
      arrange(desc(logFC))
    head(data_all_sort)
    
    geneList = data_all_sort$logFC #把foldchange按照从大到小提取出来
    names(geneList) <- data_all_sort$ENTREZID #给上面提取的foldchange加上对应上ENTREZID
    head(geneList)
    
    ge = DEG_filter$logFC
    names(ge) = rownames(DEG_filter)
    ge = sort(ge,decreasing = T)
    head(ge)
    

    6.2 GSEA

    library(msigdbr)
    GO_df = msigdbr(species = "Homo sapiens",category = "C5") %>% 
      dplyr::select(gene_symbol,gs_exact_source,gs_subcat)
    dim(GO_df)
    length(unique(KEGG_df$gs_exact_source)) # 通路数量
    
    # GSEA
    library(GSVA)
    em <- GSEA(ge, TERM2GENE = GO_df)
    gseaplot2(em, geneSetID = 1, title = em$Description[1])
    

    7. 套路七:OncoScore文本挖掘,评分

    8. 套路八:与重要基因的相关性分析(如免疫检查点基因)

    9. 套路九:与单细胞测序数据结合

    年后分享!

    10. 套路十:oncomine,TIME,GEPIA等在线网站的使用

    前九件套小编主要用于非TCGA数据的分析。

    相关文章

      网友评论

        本文标题:【年末福利】以单基因生信分析切入,分享ssgsva、GSEA等套

        本文链接:https://www.haomeiwen.com/subject/gzrvhrtx.html