美文网首页生物信息学R语言源码单细胞测序专题集合
单细胞功能集GSVA以及转录调控SCENIC分析流程

单细胞功能集GSVA以及转录调控SCENIC分析流程

作者: 小光amateur | 来源:发表于2020-03-20 16:34 被阅读0次
    image

    基因集变异分析(GSVA)

    方法:

    基因集变异分析在传统转录组分析时就已经使用了,针对单细胞数据,其实是一样的。首先通过GSVA函数将细胞-基因表达矩阵转换为细胞-基因集表达矩阵,然后针对该矩阵计算两组或者多组之间的差异,并通过热图展示该差异。

    单细胞的分组可能有多个层面,例如不同细胞活着亚类分组,对照组和实验组,不同时间段的样本等等。因此最好先按照不同的分组将表达量矩阵和细胞分组信息提取出来,以便后续分析

    需要的R包:

    • GSEAbase
    • GSVA
    • limma
    • Seurat
    • Biobase
    • genefilter
    • dplyr
    • pystr (git)
    • purrr
    • tidyr
    • tibble

    提取不同分组的表达矩阵:

    相同细胞分类下不同时间样本的表达矩阵和细胞分组

    library(Seurat)
    library(dplyr)
    ##我这里有7个cluster,5个时间阶段
    for (i in 1:7){
        name<-paste0("./cluster_",i,"/cluster_",i,".afterclu.rds")
        endo<-readRDS(name)
        time<-endo@meta.data$case
        time[which(grepl("^pf_",time))]<-"PF"
        time[which(grepl("^po3_",time))]<-"PO3"
        time[which(grepl("^po7_",time))]<-"PO7"
        time[which(grepl("^po11_",time))]<-"PO11"
        time[which(grepl("^rif_",time))]<-"RIF"
        endo@meta.data$time<-time
        endo[['Cell.type']]<-endo[['seurat_clusters']]
        endo2<-subset(endo,subset=time%in%c("PF","PO3","PO7","PO11"))
        for (cls in unique(endo2$Cell.type)){
            exam<-subset(endo2,idents=cls)
            DefaultAssay(exam)<-"integrated"
            exprMat<-GetAssayData(exam,slot="data")
            exprMat<-as.matrix(exprMat)
            cellInfo<-exam@meta.data%>%select(Cell.type,time)
            write.table(exprMat,paste0("./cluster_",i,"/cluster_",i,"_",cls,".exprMat_.txt"),sep="\t",quote=FALSE)
            write.table(cellInfo,paste0("./cluster_",i,"/cluster_",i,"_",cls,".cellInfo_.txt"),sep="\t",quote=FALSE)
    }}
    

    相同时间不同细胞的矩阵和信息

    library(Seurat)
    library(dplyr)
    for (i in 1:7){
        name<-paste0("./cluster_",i,"/cluster_",i,".afterclu.rds")
        endo<-readRDS(name)
        time<-endo@meta.data$case
        time[which(grepl("^pf_",time))]<-"PF"
        time[which(grepl("^po3_",time))]<-"PO3"
        time[which(grepl("^po7_",time))]<-"PO7"
        time[which(grepl("^po11_",time))]<-"PO11"
        time[which(grepl("^rif_",time))]<-"RIF"
        endo@meta.data$time<-time
        endo[['Cell.type']]<-endo[['seurat_clusters']]
        for (ti in c("PF","PO3","PO7","PO11")){
            exam<-subset(endo,subset=time==ti)
            DefaultAssay(exam)<-"integrated"
            exprMat<-GetAssayData(exam,slot="data")
            exprMat<-as.matrix(exprMat)
            cellInfo<-exam@meta.data%>%select(Cell.type,time)
            write.table(exprMat,paste0("./cluster_",i,"/cluster_",i,".exprMat_",ti,".txt"),sep="\t",quote=FALSE)
            write.table(cellInfo,paste0("./cluster_",i,"/cluster_",i,".cellInfo_",ti,".txt"),sep="\t",quote=FALSE)
    }
    }
    

    去GSEA官网下载基因集合gmt文件

    GSEA Download

    这里有多个基因集,具体的介绍参考:
    https://www.gsea-msigdb.org/gsea/msigdb/index.jsp

    整合流程完成GSEA分析和差异分析

    一些注意事项:

    接着,做GSVA分析,获取GSVA矩阵,做GSVA分析可以参考传统bulk-RNA分析的方法

    这里还需要注意几个问题:

    • 处理普通RNA数据需要预先过滤,但是单细胞数据取自Seurat对象,已经预先过滤好了
    • 如果输入是原始counts值,需要设置参数kcdf="Possion",但如果是TPM值,默认就好,因为我们输入是标准化后的数据,所以用默认参数
    • 默认参数mx.diff=TURE,结果是一个类似于负二项分布,因为后面要做差异分析,所以需要使用该参数,如果设置mx.diff=FALSE,则为高斯分布

    先计算相同时间不同细胞的基因集差异

    library(GSEABase)
    library(Biobase)
    library(genefilter)
    library(limma)
    library(RColorBrewer)
    library(GSVA)
    library(dplyr)
    library(pystr)
    ##基因集这里看你用哪个了
    geneSets <- getGmt("/dellfsqd2/ST_LBI/USER/panxiaoguang/SingleCell/liangxue_data/cluster_1/GSVA/DataBase/c7.all.v7.0.symbols.gmt")
    #geneSets <- getGmt("/dellfsqd2/ST_LBI/USER/panxiaoguang/SingleCell/liangxue_data/cluster_1/GSVA/DataBase/c3.tft.v7.0.symbols.gmt")
    ###因为分析的是亚类,这里是针对细胞大类,意思是这
    ###里针对的是细胞大类1的所有亚类进行的分析
    clus<-1
    ###定义不同时间点
    time<-c("PF","PO3","PO7","PO11")
    ###输入一个列表或向量,进行重新编组
    re_grp<-function(x,s){
        purrr::map_chr(x,function(haha){if (haha==s){"A"}else{"B"}})
        }
    ##计算差异并返回差异数据框,x=分组信息,cellInfo=细胞分类信息(必须是数据框,
    ##行名为细胞,列名为Cell.type,里面是分组信息),res_es=GSVA表达矩阵,
    ##num=想要留下top-n个基因集
    cal_Diff<-function(x,cellInfo,res_es,num){
        need<-cellInfo%>%transmute(new_group=re_grp(Cell.type,x))%>%.$new_group
        grouP<-as.factor(need)
        desigN<-model.matrix(~grouP+0)
        rownames(desigN)<-rownames(cellInfo)
        comparE<-makeContrasts(grouPA-grouPB,levels=desigN)
        fiT <- lmFit(res_es, desigN)
        fiT2 <- contrasts.fit(fiT, comparE)
        fiT3 <- eBayes(fiT2)
        Diff<-topTable(fiT3,p.value=0.05,coef=1,num=num)
        if (nrow(Diff)>0){
            Diff$cluster<-x
            Diff<-Diff%>%tibble::rownames_to_column(var="Name")%>%tbl_df()%>%select(Name,logFC,t,P.Value,adj.P.Val,cluster)
        }else{
            Diff<-tibble::tibble(Name=NA,logFC=NA,t=NA,P.Value=NA,adj.P.Val=NA,cluster=NA)
        }
    }
    ###计算不同分组的平均表达量,mat=表达量矩阵,info=cellInfo
    Average_express<-function(mat,info){
        new_res<-mat%>%tibble::rownames_to_column(var="Name")%>%tidyr::gather(cell,value,-Name)
        new_cellinfo<-info
        new_cellinfo<-tibble::rownames_to_column(new_cellinfo,var="cell")
        all<-left_join(new_res,new_cellinfo,by="cell")
        fuck<-all%>%group_by(Name,Cell.type)%>%summarise(Mean=mean(value))%>%tidyr::spread(Cell.type,Mean)
        fuck
    }
    for(i in time){
        expMat<- read.table(pystr_format("cluster_{1}.exprMat_{2}.txt",clus,i),sep="\t",header=TRUE,row.names=1,check.names=FALSE,stringsAsFactors=FALSE)
        cellInfo<-read.table(pystr_format("cluster_{1}.cellInfo_{2}.txt",clus,i),sep="\t",header=TRUE,row.names=1,check.names=FALSE,stringsAsFactors=FALSE)
        mydata<-as.matrix(expMat)
        all_cell<-unique(cellInfo$Cell.type)
        res_es <- gsva(mydata, geneSets, min.sz=10, max.sz=500, verbose=FALSE, parallel.sz=8)
        res_es<-as.data.frame(res_es)
        write.table(res_es,pystr_format("./cell_diff/cluster_{1}_{2}_c7.txt",clus,i),sep="\t",quote=FALSE)
        fin<-do.call('rbind',lapply(all_cell,cal_Diff,cellInfo=cellInfo,res_es=res_es,num=10))
        fin<-fin%>%filter(!is.na(cluster))
        write.table(fin,pystr_format("./cell_diff/cluster_{1}_{2}_c7_diff_geneset.txt",clus,i),sep="\t",quote=FALSE,row.names=FALSE)
        heihei2<-Average_express(res_es,cellInfo)
        heihei_need<-filter(heihei2,Name%in%unique(fin$Name))
        if (nrow(heihei_need)>0){
          houhou2<-tibble::column_to_rownames(heihei_need,var="Name")
          pheatmap::pheatmap(houhou2,file=pystr_format("./cell_diff/cluster_{1}_{2}_c7_diff_heat.pdf",clus,i),fontsize=8)
        }else{
          next
        }
    }
    

    再计算相同细胞不同时间的差异基因集

    library(GSEABase)
    library(Biobase)
    library(genefilter)
    library(limma)
    library(RColorBrewer)
    library(GSVA)
    library(dplyr)
    library(pystr)
    geneSets <- getGmt("/dellfsqd2/ST_LBI/USER/panxiaoguang/SingleCell/liangxue_data/cluster_1/GSVA/DataBase/c7.all.v7.0.symbols.gmt")
    #geneSets <- getGmt("/dellfsqd2/ST_LBI/USER/panxiaoguang/SingleCell/liangxue_data/cluster_1/GSVA/DataBase/c3.tft.v7.0.symbols.gmt")
    ##clus是细胞亚类分型,这个不能多写
    clus<-c(0,1,2,3)
    ##细胞大类的名字
    big_cls<-1
    re_grp<-function(x,s){
        purrr::map_chr(x,function(haha){if (haha==s){"A"}else{"B"}})
        }
    cal_Diff<-function(x,cellInfo,res_es,num){
        need<-cellInfo%>%transmute(new_group=re_grp(Cell.type,x))%>%.$new_group
        grouP<-as.factor(need)
        desigN<-model.matrix(~grouP+0)
        rownames(desigN)<-rownames(cellInfo)
        comparE<-makeContrasts(grouPA-grouPB,levels=desigN)
        fiT <- lmFit(res_es, desigN)
        fiT2 <- contrasts.fit(fiT, comparE)
        fiT3 <- eBayes(fiT2)
        Diff<-topTable(fiT3,p.value=0.05,coef=1,num=num)
        if (nrow(Diff)>0){
            Diff$cluster<-x
            Diff<-Diff%>%tibble::rownames_to_column(var="Name")%>%tbl_df()%>%select(Name,logFC,t,P.Value,adj.P.Val,cluster)
        }else{
            Diff<-tibble::tibble(Name=NA,logFC=NA,t=NA,P.Value=NA,adj.P.Val=NA,cluster=NA)
        }
    }
    Average_express<-function(mat,info){
        new_res<-mat%>%tibble::rownames_to_column(var="Name")%>%tidyr::gather(cell,value,-Name)
        new_cellinfo<-info 
        new_cellinfo<-tibble::rownames_to_column(new_cellinfo,var="cell")
        all<-left_join(new_res,new_cellinfo,by="cell")
        fuck<-all%>%group_by(Name,Cell.type)%>%summarise(Mean=mean(value))%>%tidyr::spread(Cell.type,Mean)
        fuck
    }
    for(i in clus){
        expMat<- read.table(pystr_format("cluster_{1}_{2}.exprMat_.txt",big_cls,i),sep="\t",header=TRUE,row.names=1,check.names=FALSE,stringsAsFactors=FALSE)
        cellInfo<-read.table(pystr_format("cluster_{1}_{2}.cellInfo_.txt",big_cls,i),sep="\t",header=TRUE,row.names=1,check.names=FALSE,stringsAsFactors=FALSE)
        colnames(cellInfo)<-c("haha","Cell.type")
        all_time<-unique(cellInfo$Cell.type)
        mydata<-as.matrix(expMat)
        res_es <- gsva(mydata, geneSets, min.sz=10, max.sz=500, verbose=FALSE, parallel.sz=8)
        res_es<-as.data.frame(res_es)
        write.table(res_es,pystr_format("./time_diff/cluster_{1}_{2}_c7.txt",big_cls,i),sep="\t",quote=FALSE)
        fin<-do.call('rbind',lapply(all_time,cal_Diff,cellInfo=cellInfo,res_es=res_es,num=10))
        fin<-fin%>%filter(!is.na(cluster))
        write.table(fin,pystr_format("./time_diff/cluster_{1}_{2}_c7_diff_geneset.txt",big_cls,i),sep="\t",quote=FALSE,row.names=FALSE)
        heihei2<-Average_express(res_es,cellInfo)
        heihei_need<-filter(heihei2,Name%in%unique(fin$Name))
        if (nrow(heihei_need)>0){
          houhou2<-tibble::column_to_rownames(heihei_need,var="Name")
          pheatmap::pheatmap(houhou2,file=pystr_format("./time_diff/cluster_{1}_{2}_c7_diff_heat.pdf",big_cls,i),fontsize=8)
        }else{
          next
        }
    }
    

    经过以上两轮的分析,你将每个层面得到三个结果

    • GSVA富集分数矩阵,正态分布
    • 每次比对的差异Top10基因集
    • 一个多组差异热图

    转录调控分析(SCENIC)

    按照SCENIC的官方教程,跑完整个流程(这里耗时很长,所以不能直接跑,要提交作业或者screen,避免中间间断
    SCENIC 基因互作数据库需要提前下载,人类的一共两个,每个1.2G
    hg19-500bp-upstream-7species.mc9nr.feather
    hg19-tss-centered-10kb-7species.mc9nr.feather

    由于R-Genie计算网络需要随机森林,所以耗时非常长,一个6K细胞的矩阵大概需要数天,所以,我们分三步计算

    • 提取表达量矩阵和细胞分组信息
    • 计算SCENIC网络
    • 计算差异并输出最后结果

    需要的R包

    • SCENIC
    • AUCell

    提取表达矩阵

    SCENIC 把整个矩阵一起打分,避免多次计算网络,消耗时间

    library(Seurat)
    library(dplyr)
    for (i in 1:7){
        name<-paste0("./cluster_",i,"/cluster_",i,".afterclu.rds")
        endo<-readRDS(name)
        time<-endo@meta.data$case
        time[which(grepl("^pf_",time))]<-"PF"
        time[which(grepl("^po3_",time))]<-"PO3"
        time[which(grepl("^po7_",time))]<-"PO7"
        time[which(grepl("^po11_",time))]<-"PO11"
        time[which(grepl("^rif_",time))]<-"RIF"
        endo@meta.data$time<-time
        endo[['Cell.type']]<-endo[['seurat_clusters']]
        exam<-subset(endo,subset=time%in%c("PF","PO3","PO7","PO11"))
        DefaultAssay(exam)<-"integrated"
        cellInfo<-exam@meta.data%>%select(Cell.type,time)
        exprMat<-GetAssayData(exam,slot="data")
        exprMat<-as.matrix(exprMat)
        write.table(exprMat,paste0("cluster_",i,".exprMat_noRIF.txt"),sep="\t",quote=FALSE)
        write.table(cellInfo,paste0("./cluster_",i,"/cluster_",i,".cellInfo_noRIF.txt"),sep="\t",quote=FALSE)
    }
    

    计算TF网络

    library(SCENIC)
    library(AUCell)
    library(pystr)
    ##填写不同的clus,对不同细胞进行分析
    clus<-1
    exprMat<-read.table(pystr_format("cluster_{1}.exprMat_noPO7.txt",clus),sep="\t",header=TRUE,row.names=1,check.names=FALSE,stringsAsFactors=FALSE)
    exprMat<-as.matrix(exprMat)
    cellInfo<-read.table(pystr_format("cluster_{1}.cellInfo_noPO7.txt",clus),sep="\t",header=TRUE,row.names=1,check.names=FALSE,stringsAsFactors=FALSE)
    cellInfo<-dplyr::select(cellInfo,Cell.type)
    colnames(cellInfo)<-"CellType"
    dir.create("int")
    saveRDS(cellInfo, file="int/cellInfo.Rds")
    org="hgnc"
    ##这个是你自己下载数据库存放的位置
    dbDir="/dellfsqd2/ST_LBI/USER/panxiaoguang/SingleCell/SCENIC/cisTarget_databases"
    myDatasetTitle="cluster_1_SCENIC_analysis"
    data(defaultDbNames)
    dbs <- defaultDbNames[[org]]
    scenicOptions <- initializeScenic(org=org, dbDir=dbDir, dbs=dbs, datasetTitle=myDatasetTitle, nCores=4)
    scenicOptions@inputDatasetInfo$cellInfo <- "int/cellInfo.Rds"
    saveRDS(scenicOptions, file="int/scenicOptions.Rds")
    exprMat_filtered <- exprMat
    runCorrelation(exprMat_filtered, scenicOptions)
    runGenie3(exprMat_filtered, scenicOptions)
    scenicOptions@settings$verbose <- TRUE
    scenicOptions@settings$nCores <- 4
    scenicOptions@settings$seed <- 123
    runSCENIC_1_coexNetwork2modules(scenicOptions)
    runSCENIC_2_createRegulons(scenicOptions)
    runSCENIC_3_scoreCells(scenicOptions, exprMat)
    

    批量计算针对每个细胞类型的不同分组的差异TF

    library(dplyr)
    library(SCENIC)
    library(AUCell)
    library(pystr)
    library(limma)
    re_grp<-function(x,s){
        purrr::map_chr(x,function(haha){if (haha==s){"A"}else{"B"}})
        }
    cal_Diff<-function(x,cellInfo,res_es,num){
        need<-cellInfo%>%transmute(new_group=re_grp(Cell.type,x))%>%.$new_group
        grouP<-as.factor(need)
        desigN<-model.matrix(~grouP+0)
        rownames(desigN)<-rownames(cellInfo)
        comparE<-makeContrasts(grouPA-grouPB,levels=desigN)
        fiT <- lmFit(res_es, desigN)
        fiT2 <- contrasts.fit(fiT, comparE)
        fiT3 <- eBayes(fiT2)
        Diff<-topTable(fiT3,p.value=0.05,coef=1,num=num)
        if (nrow(Diff)>0){
            Diff$cluster<-x
            Diff<-Diff%>%tibble::rownames_to_column(var="Name")%>%tbl_df()%>%select(Name,logFC,t,P.Value,adj.P.Val,cluster)
        }else{
            Diff<-tibble::tibble(Name=NA,logFC=NA,t=NA,P.Value=NA,adj.P.Val=NA,cluster=NA)
        }
    }
    Average_express<-function(mat,info){
        new_res<-mat%>%tibble::rownames_to_column(var="Name")%>%tidyr::gather(cell,value,-Name)
        new_cellinfo<-info
        new_cellinfo<-tibble::rownames_to_column(new_cellinfo,var="cell")
        all<-left_join(new_res,new_cellinfo,by="cell")
        fuck<-all%>%group_by(Name,Cell.type)%>%summarise(Mean=mean(value))%>%tidyr::spread(Cell.type,Mean)
        fuck
    }
    ###不同的clus针对不同的细胞类
    clus<-1
    scenicOptions <- readRDS("int/scenicOptions.Rds")
    regulons <- loadInt(scenicOptions, "regulons")
    regulons2 <- loadInt(scenicOptions, "aucell_regulons")
    regulonAUC <- loadInt(scenicOptions, "aucell_regulonAUC")
    regulonAUC <- regulonAUC[onlyNonDuplicatedExtended(rownames(regulonAUC)),]
    cellInfo<-read.table(pystr_format("cluster_{1}.cellInfo_noPO7.txt",clus),sep="\t",header=TRUE,row.names=1,check.names=FALSE,stringsAsFactors=FALSE)
    for (i in c("PF","PO3","PO7","PO11")){
        cellInfo2<-cellInfo[which(cellInfo$time==i),]
        exp<-as.data.frame(getAUC(regulonAUC))[,rownames(cellInfo2)]
        exp2<-t(scale(t(exp), center = T, scale=T))
        fin<-do.call('rbind',lapply(unique(cellInfo2$Cell.type),cal_Diff,cellInfo=cellInfo2,res_es=exp2,num=10))
        fin<-fin%>%filter(!is.na(cluster))
        write.table(fin,pystr_format("cluster_{1}_{2}_TF_diff_geneset.txt",clus,i),sep="\t",quote=FALSE,row.names=FALSE)
        haha<-Average_express(exp,cellInfo2)%>%tibble::column_to_rownames(var="Name")%>%as.data.frame()
        haha<-haha[unique(fin$Name),]
        if (nrow(haha)>0){
            haha_scale<-t(scale(t(haha), center = T, scale=T))
            pheatmap::pheatmap(haha_scale,file=pystr_format("cluster_{1}_{2}_TF_heatmap.pdf",clus,i))
        }else{
            next
        }
        all_tf<-t(purrr::map_dfr(regulons2[unique(fin$Name)],function(x) {stringr::str_c(x,collapse=",")}))
        write.table(all_tf,pystr_format("cluster_{1}_{2}_TF_duiying.txt",clus,i),sep="\t",quote=FALSE,col.names=FALSE)
    }
    for (i in unique(cellInfo$Cell.type)){
        cellInfo2<-cellInfo[which(cellInfo$Cell.type==i),]
        colnames(cellInfo2)<-c("haha","Cell.type")
        exp<-as.data.frame(getAUC(regulonAUC))[,rownames(cellInfo2)]
        exp2<-t(scale(t(exp), center = T, scale=T))
        fin<-do.call('rbind',lapply(unique(cellInfo2$Cell.type),cal_Diff,cellInfo=cellInfo2,res_es=exp2,num=10))
        fin<-fin%>%filter(!is.na(cluster))
        write.table(fin,pystr_format("cluster_{1}_{2}_TF_diff_geneset.txt",clus,i),sep="\t",quote=FALSE,row.names=FALSE)
        haha<-Average_express(exp,cellInfo2)%>%tibble::column_to_rownames(var="Name")%>%as.data.frame()
        haha<-haha[unique(fin$Name),]
        if (nrow(haha)>0){
            haha_scale<-t(scale(t(haha), center = T, scale=T))
            pheatmap::pheatmap(haha_scale,file=pystr_format("cluster_{1}_{2}_TF_heatmap.pdf",clus,i))
        }else{
            next
        }
        all_tf<-t(purrr::map_dfr(regulons2[unique(fin$Name)],function(x) {stringr::str_c(x,collapse=",")}))
        write.table(all_tf,pystr_format("cluster_{1}_{2}_TF_duiying.txt",clus,i),sep="\t",quote=FALSE,col.names=FALSE)
    }
    

    经过以上分析后,同样会得到三个文件:

    • 一个不同组间差异的top10转录因子基因集合
    • 一个不同分组的差异热图
    • 一个TF对应的所有基因表格

    相关文章

      网友评论

        本文标题:单细胞功能集GSVA以及转录调控SCENIC分析流程

        本文链接:https://www.haomeiwen.com/subject/ernjyhtx.html