GSVA分析

作者: 夕颜00 | 来源:发表于2021-10-14 17:15 被阅读0次

    GSVA其实就是pathway级别的差异分析,根据GSVA的原理其实就是计算每个sample的GSEA然后得出类似pathway enrich score,把这个可以当作芯片的表达数据一样,再用limma包分析差异基因。
    参考以下两个例子:
    使用GSVA方法计算某基因集在各个样本的表现
    充分理解GSVA和GSEA
    以及我一个例子:

    rm(list=ls())
    suppressMessages(library(GSVA))
    suppressMessages(library(GSVAdata))
    suppressMessages(library(GSEABase))
    suppressMessages(library(limma))
    
    #读入gmt文件,这个可以从MSigDB上下载,这边选的上gene symbol根据自己的data来选择
    gmt_file="~/c5.bp.v7.1.symbols.gmt"
    geneset <- getGmt(gmt_file)  
    
    #自行读入exp文件,我这边为行为gene symbol,列为样本(4个control,5个treatment样本)
    es <- gsva(as.matrix(exp), geneset,
                        min.sz=10, max.sz=500, verbose=TRUE)
    #得到gsva计算的数值后再用limma包做差异分析得到差异的pathway
    design <- model.matrix(~ factor(c(rep("cont",4),rep("treatment",5))))
    colnames(design) <- c("ALL", "contvstreatment")
    row.names(design)<-colnames(exp)
    fit <- lmFit(es, design)
    fit <- eBayes(fit)
    #这边是总的
    allGeneSets <- topTable(fit, coef="contvstreatment", number=Inf)
    #这边是差异的
    adjPvalueCutoff <- 0.001
    DEgeneSets <- topTable(fit, coef="contvstreatment", number=Inf,
                           p.value=adjPvalueCutoff, adjust="BH")
    res <- decideTests(fit, p.value=adjPvalueCutoff)
    
    

    例子2

    gmt_file="~/Desktop/GoogleDrive/HPC/gmt/common/c5.bp.v7.1.symbols.gmt"
    geneset <- getGmt(gmt_file)  
    tpm_1st_pass<-read.csv(paste0(data_dir,"TPM_GenePass.csv"),row.names = 1)
    gsva_analysis<-function(sample,design,geneset,p,out_dir){
      exp<-get(sample)
      es <- gsva(as.matrix(exp), geneset,
                 min.sz=10, max.sz=500, verbose=TRUE)
      colnames(design) <- c("ALL", "ALvsFast")
      row.names(design)<-colnames(exp)
      fit <- lmFit(es, design)
      fit <- eBayes(fit)
      allGeneSets <- topTable(fit, coef="ALvsFast", number=Inf)
      DEgeneSets <- allGeneSets[allGeneSets$P.Value<p,]
      write.csv(es,paste0(out_dir_table,sample,"_GSVA_es_matrix.csv"))
      write.csv(allGeneSets,paste0(out_dir_table,sample,"_GSVA_degs_analysis_matrix.csv"))
      write.csv(DEgeneSets,paste0(out_dir_table,sample,"_GSVA_degs_sig_p_",as.character(p),"_analysis_matrix.csv"))
      output <- list(a = es, b = allGeneSets,c=DEgeneSets)
      return(output)
    }
    p=0.05
    design_1st <- model.matrix(~ factor(c(rep("AL",3),rep("Fast",3))))
    gsva_1st<-gsva_analysis(sample="tpm_1st_pass",design=design_1st,geneset=geneset,p=p,out_dir=out_dir_table)
    gsva_1st_es<-gsva_1st$a
    gsva_1st_degs<-gsva_1st$b
    gsva_1st_degs_sig<-gsva_1st$c
    
    

    包装为一个函数,使用sbatch,下面是mouse data转化为human gene symbol然后run gsva

    #!/usr/bin/env Rscript
    #use for mouse data to human gsva analysis
    #all data files and gmt files should be in the same folder
    #@author: CFJiang 04252020
    suppressMessages(library(argparse))
    rm(list = ls())
    options(stringsAsFactors=F)
    
    parser <- ArgumentParser(description="")
    
    parser$add_argument('-d', '--data_dir', metavar="*", help="path for your data")
    parser$add_argument('-f', '--data_filename', metavar="*", help="data file name")
    parser$add_argument('-p', '--prefix', metavar="*", help="prefix for your output")
    parser$add_argument('-g1', '--group1', metavar="*", help="group1 name")
    parser$add_argument('-g2', '--group2', metavar="*", help="group2 name")
    parser$add_argument('-t', '--type', metavar="*", help="type for either KEGG or GO")
    args <- parser$parse_args()
    if (is.null(args$type)) { args$type <-"KEGG" }
    
    #useage & example
    
    data_dir= as.character(args$data_dir)  #your data folder
    data_filename= as.character(args$data_filename) #your data file full name
    prefix= as.character(args$prefix) #your out put flile prefix
    group1= as.character(args$group1)  #your group1
    group2= as.character(args$group2)  #your group2
    type=as.character(args$type) #your analysis type (KEGG or GO)
    gsva_all_in_one<-function(data_dir,data_filename,prefix,group1,group2,type){
    rm(list = ls())
    options(stringsAsFactors=F)
    suppressMessages(library(GSVA))
    suppressMessages(library(GSEABase))
    suppressMessages(library(limma))
    suppressMessages(library(Hmisc))
    change_name<-function(rt){
      ann<-read.csv(paste0(data_dir,"/HHOM_MouseHumanSequence_mouse_human_combined.csv"),row.names = 1,header = T)
      com_id<-intersect(row.names(ann),row.names(rt))
      ann_com<-ann[com_id,]
      rt_com<-rt[com_id,]
      row.names(rt_com)<-as.character(ann_com$Symbol)
      return(rt_com)
    }
    
    #read your data .csv
    your_data<-read.csv(paste0(data_dir,"/",data_filename),header = T)
    your_data<-your_data[!duplicated(your_data[,1]),]
    row.names(your_data)<-as.character(your_data[,1])
    your_data<-your_data[,-1]
    #change the mouse gene name to human gene name
    new_rt<-change_name(your_data)
    #gsva analysis
    gsva_de<-function(data_dir,new_rt,prefix,group1,group2,type){
      new_dir<-paste0(data_dir,"/",prefix)
      dir.create(new_dir,recursive = T)
      setwd(new_dir)
      if (type=="KEGG"){
      gmt_file=paste0(data_dir,"/c2.cp.kegg.v7.1.symbols.gmt")
      }else{
      gmt_file=paste0(data_dir,"/c5.bp.v7.1.symbols.gmt") 
      }
      geneset <- getGmt(gmt_file) 
      es <- gsva(as.matrix(new_rt), geneset,
                 min.sz=10, max.sz=500, verbose=TRUE)
      gs<-paste0("(",as.character(group1),"|",as.character(group2),").*")
      level<-c(as.character(group1),as.character(group2))
      design <- model.matrix(~factor(gsub(gs, "\\1", colnames(new_rt)),levels = level))
      compare<-paste0(as.character(group1),"vs",as.character(group2))
      colnames(design)<-c("ALL", compare)
      row.names(design)<-colnames(new_rt)
      fit <- lmFit(es, design)
      fit <- eBayes(fit)
      allGeneSets <- topTable(fit, coef=compare, number=Inf)
      adjPvalueCutoff <- 0.05
      DEgeneSets <- topTable(fit, coef=compare, number=Inf,
                             p.value=adjPvalueCutoff, adjust="BH")
      write.csv(es,paste0(prefix,"_",compare,"_all_gsva_matrix_",type,".csv"))
      write.csv(allGeneSets,paste0(prefix,"_",compare,"_all_deg_gsva_",type,".csv"))
      write.csv(DEgeneSets,paste0(prefix,"_",compare,"_padj_",as.character(adjPvalueCutoff),"_deg_gsva_",type,".csv"))
    }
    gsva_de(data_dir=data_dir,new_rt = new_rt ,prefix = prefix ,group1 = group1,group2 = group2,type=type)
    }
    
    # data_dir= "~/Desktop/GSVA/ori_data"  #your data folder
    # data_filename= "nonCD45_T_count_test.csv" #your data file full name
    # prefix= "nonCD45_T"  #your out put flile prefix
    # group1= "FB6"  #your group1
    # group2= "FT2"  #your group2
    
    gsva_all_in_one(data_dir=data_dir,data_filename =data_filename,prefix = prefix,group1 = group1,group2=group2,type=type )
    
    

    shell脚本如下

    #! /bin/bash
    #SBATCH --time=10:00:00
    #SBATCH --cpus-per-task=10
    #SBATCH --mem=16g
    ml R/3.5
    data_dir=
    Rscript ./GSVA_mouse.R -d ${data_dir} -f nonCD45_T_count_test.csv -p nonCD45_T -g1 FB6 -g2 FT2 -t KEGG
    

    另外一个文章

    D11 2 理解GSVA 单个基因集的GSVA

    rm(list = ls())
    load(file = 'step2_select.Rdata')
    # 每次都要检测数据
    dat<- as.data.frame(t(M_expr))
    dat[1:4,1:4]  
    library(hgu133plus2.db)
    ls("package:hgu133plus2.db")
    ids=toTable(hgu133plus2SYMBOL)
    head(ids) 
    ids$symbol
    
    dat<- dat[rownames(dat)%in%ids$symbol,]
    
    table(rownames(dat) %in% ids$symbol)
    
    ids<- ids[ids$symbol%in%rownames(dat),]
    ids<- ids[!duplicated(ids$symbol),]
    
    ids$median=apply(dat,1,median)
    # 
    #   ids=ids[order(ids$symbol,ids$median,decreasing = T),]
    #   ids=ids[!duplicated(ids$symbol),]
    # dat=dat[ids$symbol,]
    # rownames(dat)=ids$symbol
    # dat[1:4,1:4]  
    Kmeans_results$group_list[Kmeans_results$`km.res$cluster`==1]<- c('cluster1')
    Kmeans_results$group_list[Kmeans_results$`km.res$cluster`==2]<- c('cluster2')
    group_list<- Kmeans_results$group_list
    X=dat
    ##############################################################################################
    X<- as.matrix(X)
    
    library(GSVA)
    
    
    
    options(stringsAsFactors = F)
    gene_Set <- read.delim("C:/Users/chenyuqiao/Desktop/LN RNAseq/Step GSVA/395gene GeneSet.txt", header=FALSE, row.names=1)
    gene_Set<- as.data.frame(t(gene_Set))
    gene_list<- as.list(gene_Set)
    
    es.max <- gsva(X, gene_list, mx.diff=FALSE, verbose=FALSE, parallel.sz=1)
    
    
    
    
    
    
    adjPvalueCutoff <- 0.05  #################这里要调整
    logFCcutoff <- log2(2)
    
      table(group_list)
      dim(es.max)
      design <- model.matrix(~0+factor(group_list))
      colnames(design)=levels(factor(group_list))
      rownames(design)=colnames(es.max)
      design
      library(limma)
      contrast.matrix<-makeContrasts(paste0(unique(group_list),collapse = "-"),
                                     levels = design)
      # contrast.matrix<-makeContrasts("TNBC-noTNBC",
      #                                levels = design)
      
      contrast.matrix ##这个矩阵声明,我们要把progres.组跟stable进行差异分析比较
      
     
        ##step1
        fit <- lmFit(es.max,design)
        ##step2
        fit2 <- contrasts.fit(fit, contrast.matrix) 
        ##这一步很重要,大家可以自行看看效果
        
        fit2 <- eBayes(fit2)  ## default no trend !!!
        ##eBayes() with trend=TRUE
        ##step3
        res <- decideTests(fit2, p.value=adjPvalueCutoff)
        summary(res)
        tempOutput = topTable(fit2, coef=1, n=Inf)
        nrDEG = na.omit(tempOutput) 
        #write.csv(nrDEG2,"limma_notrend.results.csv",quote = F)
        head(nrDEG)
        
        data_gene_set<- as.data.frame(es.max)
        
    
        nrDEG=nrDEG[nrDEG$P.Value<0.5 & abs(nrDEG$logFC) > 0.1,]   #######在这里调阈值,保证输出
        print(dim(nrDEG))
    
          n=rownames(nrDEG)
          match(n,rownames(nrDEG))
          data_gene_set=data_gene_set[match(n,rownames(data_gene_set)),]  ######n=rownames(nrDEG),筛选表达矩阵数据
          ac=data.frame(g=group_list)   ########对应后面的图中的annotation_col
          rownames(ac)=colnames(data_gene_set)
          # rownames(nrDEG)=substring(rownames(nrDEG),1,50)
          library(pheatmap)
          pheatmap(data_gene_set)
          dev.off
          pheatmap(data_gene_set)
          pheatmap::pheatmap(data_gene_set, 
                             fontsize_row = 8,height = 11,
                             annotation_col = ac,show_colnames = F,filename = '11111111111.pdf')
          
          # pheatmap::pheatmap(nrDEG, 
          #                    fontsize_row = 8,height = 11,
          #                    annotation_col = ac,show_colnames = F,
          #                    filename = '11111111111.pdf')
          # 
       
    

    转载来自:
    作者:落寞的橙子
    链接:https://www.jianshu.com/p/975adf808130
    作者:陈宇乔
    链接:https://www.jianshu.com/p/6af88970fcdf

    相关文章

      网友评论

        本文标题:GSVA分析

        本文链接:https://www.haomeiwen.com/subject/iejuoltx.html