美文网首页
WGCNA分析-单细胞转录组高阶分析01

WGCNA分析-单细胞转录组高阶分析01

作者: 一车小面包人 | 来源:发表于2023-09-10 20:06 被阅读0次

    背景:在umap聚类的基础上,进行WGCNA的分析。

    • WGCNA
      (1)两个部分:表达量聚类分析和表型关联
      (2)四个步骤:基因之间相关系数计算、基因模块的确定、共表达网络、模块与性状关联
    • 预处理:将原始细胞基因表达矩阵转换为伪细胞基因表达矩阵
      所谓的伪细胞就是将原来10(size=10)个细胞当作是1个细胞来分析,当然size是可以改变的。
      这里写一个过程:
    library(Seurat)
    library(dplyr)
    sc<-readRDS("./sc_umap.rds")
    sc<-sc
    #' 构建原始细胞名字和伪细胞名字的对照表格cell.id_table
    old_cell.id<-c()
    new_cell.id<-c()
    for(i in unique(sc$seurat_clusters)){
            sub_meta.data<-sc@meta.data[sc@meta.data$seurat_clusters==i,]
            sub_cell.id<-rownames(sub_meta.data)
            int_num<-floor(length(sub_cell.id)/10)
            remain_num<-length(sub_cell.id)%%10
    
            for(j in 1:int_num){
                    sub_new<-rep(paste0(i,"_cell_",j),10)
                    new_cell.id<-c(new_cell.id,sub_new)
            }
            sub_new<-rep(paste0(i,"_cell_",j+1),remain_num)
            new_cell.id<-c(new_cell.id,sub_new)
            old_cell.id<-c(old_cell.id,sample(sub_cell.id)) #'这里sample函数相当重要,它打乱了原始的细胞顺序,增加随机性
    }
    cell.id_table<-data.frame(new.id=new_cell.id,old.id=old_cell.id)
    head(cell.id_table) 
    
    cell.id_table.png
    table(cell.id_table$new.id)可以看见每个伪细胞里其实包含了原始的10个细胞:
    table.png
    #' 提取原始seurat对象的标准化表达矩阵
    t.counts<-data.table(t(as.matrix(slot(sc@assays[["RNA"]],"data"))),keep.rownames=T) #'这样提取避免行名是字符串
    rownames(cell.id_table)<-cell.id_table$old.id
    cell.id_table<-cell.id_table[t.counts$rn,]
    t.counts$rn<-cell.id_table$new.id
    t.counts<- t.counts[,lapply(.SD,mean), by=rn] #’根据rn那一列的值,合并基因表达量,mean代表使用平均表达量
    gene.name <- t.counts$rn
    cell.id <- rownames(t.counts)
    my.counts <- t(t.counts[,rn:=NULL])
    colnames(my.counts)<-cell.id)
    new.sc<- CreateSeuratObject(my.counts)
    saveRDS(new.sc,"pseudo_sc.rds")
    
    my.counts.png

    可以看到细胞名字已经变成了伪细胞的名字。

    • WGCNA分析
    library(WGCNA)
    options(stringsAsFactors = FALSE)
    enableWGCNAThreads() #' 打开多线程
    datExpr0<-as.data.frame(t(my.counts)) #' 行为伪细胞,列为基因
    #' 检查缺失值
    gsg = goodSamplesGenes(datExpr0, verbose = 3)
    gsg$allOK #' 这个值为TRUE就没问题,反之进行以下筛选
    if (!gsg$allOK){
      # Optionally, print the gene and sample names that were removed:
      if (sum(!gsg$goodGenes)>0)
        printFlush(paste("Removing genes:", paste(names(datExpr0)[!gsg$goodGenes], collapse = ", ")))
      if (sum(!gsg$goodSamples)>0)
        printFlush(paste("Removing samples:", paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", ")))
      # Remove the offending genes and samples from the data:
      datExpr0 = datExpr0[gsg$goodSamples, gsg$goodGenes]
    }
    

    这里也许过滤了一些细胞或者基因,接下来进一步过滤基因:

    #' 过滤,根据阈值筛选基因列
    meanFPKM= 0.5  #' 过滤标准,可以修改
    n=nrow(datExpr0)
    datExpr0[n+1,]=apply(datExpr0[c(1:nrow(datExpr0)),],2,mean) #' 计算每一列基因的平均值
    datExpr0=datExpr0[1:n,datExpr0[n+1,] > meanFPKM] #' 只保留平均值大于阈值的列
    # for meanFpkm in row n+1 and it must be above what you set--select meanFpkm>opt$meanFpkm(by rp)
    filtered_fpkm=t(datExpr0) #' 转置,变为行为基因,列为细胞
    filtered_fpkm=data.frame(rownames(filtered_fpkm),filtered_fpkm) #' 转换为数据框
    names(filtered_fpkm)[1]="sample"
    head(filtered_fpkm)
    write.table(filtered_fpkm, file="mRNA.symbol.uniq.filter.txt",
                row.names=F, col.names=T,quote=FALSE,sep="\t")
    
    nGenes = ncol(datExpr0)
    nSamples = nrow(datExpr0)
    
    head(filtered_fpkm).png

    接下来过滤一些细胞:

    #' 聚类,对伪细胞进行聚类,筛选伪细胞,去除聚类关系上较远的伪细胞
    sampleTree = hclust(dist(datExpr0), method = "average")
    pdf(file = "sampleClustering_fig1.pdf", width = 15, height = 8)
    par(cex = 0.6)
    par(mar = c(0,6,6,0))
    plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 2,
         cex.axis = 1.5, cex.main = 2)
    ### Plot a line to show the cut
    #abline(h = 180, col = "red")##剪切高度不确定,故无红线
    dev.off()
    
    sampleclustering.png

    接下来是较为关键的一步,寻找合适的软阈值power:

    #' 寻找软阈值power,power是对相关性的值进行幂次运算(所谓加权网络的边)的值
    #' abs(cor(geneX, geneY)) ^ power
    powers=c(c(1:10),seq(from = 12, to = 20, by = 2))
    sft = pickSoftThreshold(datExpr0, powerVector = powers, verbose = 5)
    #' 可视化power,拐点位置就是合适的值
    pdf('powers_fig2.pdf',18,10)
    par(mfrow = c(1,2))
    cex1 = 0.9
    plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
         xlab="Soft Threshold (power)",
         ylab="Scale Free Topology Model Fit,signed R^2",type="n",
         main = paste("Scale independence"))
    text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
         labels=powers,cex=cex1,col="red")
    abline(h=0.85,col="red")
    plot(sft$fitIndices[,1], sft$fitIndices[,5],
         xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
         main = paste("Mean connectivity"))
    text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers,
         cex=cex1, col="red")
    dev.off()
    
    powers_fig2.png

    通常,选取在拐点处的值为软阈值,这里power=4。

    #' 获取预测的power值,如果没有则根据以下规则获取
    power = sft$powerEstimate
    if (is.na(power)){
      power = ifelse(nSamples<20, ifelse(type == "unsigned", 9, 18),
              ifelse(nSamples<30, ifelse(type == "unsigned", 8, 16),
              ifelse(nSamples<40, ifelse(type == "unsigned", 7, 14),
              ifelse(type == "unsigned", 6, 12))
              )
              )
    }
    

    接下来计算基因模块:

    #' 确定模块modules
    #' TOMType = "unsigned"构建无尺度网络;minModuleSize = 30最小模块基因数为30;
    #' TOM矩阵(topological overlap matrix,TOM拓扑重叠矩阵):数值都在[0,1]间,
    #' 是用于反应两两基因共表达的相似度,值越高,说明共表达相似度越高
    cor = WGCNA::cor #' 指定使用WGCNA包中的相关性计算函数
    net = blockwiseModules(datExpr0, power = power, maxBlockSize = nGenes,
                           TOMType = "unsigned", minModuleSize = 30,
                           reassignThreshold = 0, mergeCutHeight = 0.25,
                           numericLabels = TRUE, pamRespectsDendro = FALSE,
                           saveTOMs=TRUE, corType = "pearson",
                           maxPOutliers=1, loadTOMs=TRUE,
                           saveTOMFileBase = paste0("first", ".tom"),
                           verbose = 3)
    cor = stats::cor
    table(net$colors)
    #' TOM矩阵的层次聚类及模块颜色结果可视化
    moduleLabels = net$colors
    moduleColors = labels2colors(moduleLabels) #' 将标签转换为颜色color
    pdf('modules_fig3.pdf',18,10)
    plotDendroAndColors(net$dendrograms[[1]], moduleColors[net$blockGenes[[1]]],
                        "Module colors",
                        dendroLabels = FALSE, hang = 0.03,
                        addGuide = TRUE, guideHang = 0.05)
    dev.off()
    
    modules_fig3.png

    这个图呢,上面部分是基因聚类,下面颜色部分是不同的基因模块。
    接下来绘制校正矩阵的热图:

    #' 校正矩阵热图
    MEs = net$MEs
    MEs_col = MEs
    colnames(MEs_col) = paste0("ME", labels2colors(as.numeric(str_replace_all(colnames(MEs),"ME",""))))
    MEs_col = orderMEs(MEs_col)
    pdf('adjacency_fig4.pdf',18,10)
    plotEigengeneNetworks(MEs_col, "Eigengene adjacency heatmap",
                          marDendro = c(3,3,2,4),
                          marHeatmap = c(3,4,2,2), plotDendrograms = T,
                          xLabelsAngle = 90)
    dev.off()
    
    adjacency_fig4.png

    模块与模块之间相关性可视化:

    #’ TOM矩阵热图可视化,模块与模块之间的相关性
    load(net$TOMFiles[1], verbose=T)
    TOM <- as.matrix(TOM)
    dissTOM = 1-TOM
    plotTOM = dissTOM^7
    diag(plotTOM) = NA
    pdf('TOM_fig5.pdf',18,10)
    TOMplot(plotTOM, net$dendrograms, moduleColors,
                    main = "Network heatmap plot, all genes")
    dev.off()
    
    TOM_fig5.png

    绘制每一个模块中关键基因的网状图:

    #' 模块里的关键基因
    probes = colnames(datExpr0)
    dimnames(TOM) <- list(probes, probes)
    cyt = exportNetworkToCytoscape(TOM,
                 edgeFile = paste("first", ".edges.txt", sep=""),
                 nodeFile = paste("first", ".nodes.txt", sep=""),
                 weighted = TRUE, threshold = 0,
                 nodeNames = probes, nodeAttr = moduleColors)
    #' 可视化每个模块的关键基因的网络图,去掉了grey颜色的模块
    edges <- read.table(paste("first", ".edges.txt", sep=""),sep='\t',header=T)
    nodes <- read.table(paste("first", ".nodes.txt", sep=""),sep='\t',header=T)
    library(RColorBrewer)
    coul<-c(brewer.pal(8, "Dark2"),brewer.pal(12, "Paired"),brewer.pal(8, "Set2"),brewer.pal(9, "Set1"))
    colnames(nodes) <- c(colnames(nodes)[1:2],'group')
    lgroup <- unique(nodes$group)
    lgroup <- lgroup[-grep('grey',lgroup)]
    for (i in lgroup){
    nodes1 <- subset(nodes,group==i)
    c1 <- which(edges[,1] %in% nodes1[,1])
    edges1 <- edges[c1,]
    c1 <- which(edges1[,2] %in% nodes1[,1])
    edges1 <- edges1[c1,]
    mtitle <- dim(nodes1)[1]
    plot_network(edges=edges1,nodes=nodes1,group.by=NULL,weight=NULL,coul = coul,layout=layout.sphere,pf=i,main=mtitle)
    }
    
    #' 筛选HUG基因,并绘制每个模块中的hug基因网络图
    library(dplyr)
    edgeData1 <- cyt$edgeData[,c(1,2,3)]
    edgeData2 <- cyt$edgeData[,c(2,1,3)]
    nodeattrib <- cyt$nodeData[,c(1,3)]
    colnames(nodeattrib) <- c("nodeName", "Module")
    colnames(edgeData1) <- c("Node1","Node2","Weight")
    colnames(edgeData2) <- c("Node1","Node2","Weight")
    edgeData <- rbind(edgeData1, edgeData2)
    edgeData$Module1 <- nodeattrib[match(edgeData$Node1, nodeattrib$nodeName), 2]
    edgeData$Module2 <- nodeattrib[match(edgeData$Node2, nodeattrib$nodeName), 2]
    edgeData <- edgeData[edgeData$Module1==edgeData$Module2,c(1,3,4)] #'同一个模块内的基因网络信息
    nodeTotalWeight <- edgeData %>% group_by(Node1, Module1) %>% summarise(weight=sum(Weight))
    nodeTotalWeight <- nodeTotalWeight[with(nodeTotalWeight, order(Module1, -weight)),]
    nodeTotalWeightTop = nodeTotalWeight %>% group_by(Module1) %>% top_n(10, weight) %>% data.frame()
    write.table(nodeTotalWeightTop,'hug_gene_list.xls',sep='\t',quote=F)
    lgroup <- unique(nodeTotalWeightTop$Module1)
    lgroup <- lgroup[-grep('grey',lgroup)]
    for (i in lgroup) {
    nodes1 <- subset(nodeTotalWeightTop,Module1==i)
    c1 <- which(edges[,1] %in% nodes1[,1])
    edges1 <- edges[c1,]
    c1 <- which(edges1[,2] %in% nodes1[,1])
    edges1 <- edges1[c1,]
    
    c1 <- which(nodes[,1] %in% nodes1[,1])
    nodes1 <- nodes[c1,]
    nodes1 <- subset(nodes1,group==i)
    mtitle <- dim(nodes1)[1]
    plot_network(edges=edges1,nodes=nodes1,group.by=NULL,weight='weight',coul = coul,layout=layout.sphere,pf=paste0("hug_genes_",i),main=mtitle)
    }
    

    接下来是将性状信息与基因模块关联起来的一些分析:

    #' 计算基因模块与性状相关性
    #' 需要性状相关的数据框信息
    trait=new.sc@meta.data #' 性状相关的数据框信息
    #trait=trait[,-1]
    corType = "pearson"
    robustY = ifelse(corType=="pearson",T,F)
    if(!is.null(trait)) {
      traitData = trait
      sampleName = rownames(datExpr0)
      traitData = traitData[match(sampleName, rownames(traitData)), ]
    
    if (corType=="pearsoon") {
      modTraitCor = cor(MEs_col, traitData, use = "p")
      modTraitP = corPvalueStudent(modTraitCor, nSamples)
    } else {
      modTraitCorP = bicorAndPvalue(MEs_col, traitData, robustY=robustY)
      modTraitCor = modTraitCorP$bicor
      modTraitP   = modTraitCorP$p
    }
    textMatrix = paste(signif(modTraitCor, 2), "\n(", signif(modTraitP, 1), ")", sep = "")
    dim(textMatrix) = dim(modTraitCor)
    pdf('corMetaGene_fig6.pdf',18,10)
    labeledHeatmap(Matrix = modTraitCor, xLabels = colnames(traitData),
                   yLabels = colnames(MEs_col),
                   cex.lab = 0.5,
                   ySymbols = colnames(MEs_col), colorLabels = FALSE,
                   colors = blueWhiteRed(50),
                   textMatrix = textMatrix, setStdMargins = FALSE,
                   cex.text = 0.5, zlim = c(-1,1),
                   main = paste("Module-trait relationships"))
    dev.off()
    }
    
    module='brown'
    pheno='orig.ident'
    if (corType=="pearsoon") {
      geneModuleMembership = as.data.frame(cor(datExpr0, MEs_col, use = "p"))
      MMPvalue = as.data.frame(corPvalueStudent(
                 as.matrix(geneModuleMembership), nSamples))
    }else {
      geneModuleMembershipA = bicorAndPvalue(datExpr0, MEs_col, robustY=robustY)
      geneModuleMembership = geneModuleMembershipA$bicor
      MMPvalue   = geneModuleMembershipA$p
    }
    
    if (corType=="pearsoon") {
      geneTraitCor = as.data.frame(cor(datExpr0, traitData, use = "p"))
      geneTraitP = as.data.frame(corPvalueStudent(
                 as.matrix(geneTraitCor), nSamples))
    } else {
      geneTraitCorA = bicorAndPvalue(datExpr0, traitData, robustY=robustY)
      geneTraitCor = as.data.frame(geneTraitCorA$bicor)
      geneTraitP   = as.data.frame(geneTraitCorA$p)
    }
    modNames = substring(colnames(MEs_col), 3)
    module_column = match(module, modNames)
    pheno_column = match(pheno,colnames(traitData))
    moduleGenes = moduleColors == module
    sizeGrWindow(7, 7)
    par(mfrow = c(1,1))
    pdf(paste0(module,"_",pheno,'_verboseScatterplot_fig7.pdf'),18,10)
    verboseScatterplot(abs(geneModuleMembership[moduleGenes, module_column]),
                       abs(geneTraitCor[moduleGenes, pheno_column]),
                       xlab = paste("Module Membership in", module, "module"),
                       ylab = paste("Gene significance for", pheno),
                       main = paste("Module membership vs. gene significance\n"),
                       cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
    dev.off()
    

    总结:是稍微复杂了一点。

    相关文章

      网友评论

          本文标题:WGCNA分析-单细胞转录组高阶分析01

          本文链接:https://www.haomeiwen.com/subject/dausvdtx.html