美文网首页
使用WGCNA分析单细胞转录组数据(懒人版) 2023-05-1

使用WGCNA分析单细胞转录组数据(懒人版) 2023-05-1

作者: 黄甫一 | 来源:发表于2023-05-18 09:49 被阅读0次

    背景介绍

    WGCNA是权重基因共表达网络分析,是常规转录组和单细胞转录组的重要分析内容之一,能找到与表型相关联的基因模块,所谓基因模块其实就是基因集,不过在WGCNA中是有共表达关系的基因集。本篇博客介绍WGCNA在单细胞转录组中的应用
    (这js是不是还有字数限制啊,写了三遍了,一发布就丢失了保存的内容,呜呜呜,一直保存不了,有毒啊,而且越来愈多广告了!嗐~)


    主函数seurat_wgcna

    obj是Seurat对象,
    group.by计算假细胞的列名,
    size计算假细胞使用的细胞数,例如设置为10表示将10个细胞合并成一个假细胞,
    slot使用的数据槽,默认为'data',一般不用改,
    assay使用的assay,默认为'RNA',
    fun计算假细胞使用的计算函数,默认为mean,
    WGCNA的参数,一般不用改type = "unsigned",corType = "pearson", pct.mad=0.75,
    label输出文件的标签,默认为'test',
    trait性状信息,默认为NULL,即meta.data,
    使用线程数nThreads,默认为10。

    • 使用实例
    obj <- readRDS('test.rds')
    seurat_wgcna(obj,group.by='celltype',size=100,slot='data',assay='Spatial',fun=mean,
                             type = "unsigned",corType = "pearson", pct.mad=0.75, label='test,trait=NULL,nThreads=20)
    

    所有代码如下

    基本函数

    library(Seurat)
    library(data.table)
    library(dplyr)
    
    condense_cell <- function(obj,group.by=NULL,size=NULL,slot='data',assay='RNA',fun=mean){
    mat <- data.table(t(as.matrix(slot(obj@assays[[assay]],slot))),keep.rownames=T)
    meta <- obj@meta.data
    meta$Celltype <- meta[,group.by]
    
    cellist <- rownames(meta)
    
    lcod <- c()
    lscell <- c()
    for (i in unique(meta$Celltype)) {
    c1 <- which(meta$Celltype==i)
    cellist1 <- cellist[c1]
    cod <- floor(seq_along(cellist1)/size)
    t1 <- data.frame(table(cod))
    t1[,1] <- as.vector(t1[,1])
    t1[,2] <- as.vector(t1[,2])
    c2 <- which(t1[,2]<size)
    short <-t1[,1][c2]
    c3 <- which(cod %in% short)
    cod[c3] <- t1[,1][which(t1[,2]>=size)[1]]
    cod <- paste0(i,"_Cell",cod)
    scell <- sample(cellist1)
    lcod <- c(lcod,cod)
    lscell <- c(lscell,scell)
    }
    
    map_df <- data.frame(lscell,lcod)
    colnames(map_df) <- c("old_id","new_id")
    rownames(map_df) <- map_df$old_id
    map_df <- map_df[mat$rn,]
    mat$rn <- map_df$new_id
    
    mat <- mat[,lapply(.SD,fun), by=rn]
    rownames(mat) <- mat$rn
    gename <- colnames(mat)
    celname <- rownames(mat)
    mat <- t(mat[,rn:=NULL])
    colnames(mat) <- celname
    rownames(mat) <- gename
    obj <- CreateSeuratObject(mat)
    return(obj)
    }
    
    library(igraph)
    plot_network <- function(edges,nodes=NULL,group.by=NULL,weight=NULL,coul = NULL,layout=layout.sphere,pf=NULL,main=""){
    if (!is.null(weight)) {
    edges$weight <- edges[,weight]
    }else{
    edges$weight <- 1
    }
    if (!is.null(group.by)) {
    nodes$group <- nodes[,group.by]
    }else{
    nodes$group <- 'group1'
    }
    network <- graph_from_data_frame(d=edges, vertices=nodes, directed=F)
    library(RColorBrewer)
    nlen <- length(unique(V(network)$group))
    
    if (is.null(coul)) {
    library(RColorBrewer)
    coul  <- c(brewer.pal(8, "Dark2"),brewer.pal(12, "Paired"),brewer.pal(8, "Set2"),brewer.pal(9, "Set1"))
    }
    coul <- coul[1:nlen]
    vertex.color <- coul[as.numeric(as.factor(V(network)$group))]
    
    pdf(paste0(pf,'_',group.by,'_network.pdf'),18,10)
    plot(network, vertex.color=vertex.color, edge.width=E(network)$weight*2,layout=layout,main=main)
    legend("topright", legend=levels(as.factor(V(network)$group)),
           col = coul, bty = "n", pch=20 , pt.cex = 3,
           cex = 1.5, text.col=coul , horiz = FALSE,
           inset = c(0.1, 0.1))
    dev.off()
    }
    
    

    WGCNA分析流程函数

    library(WGCNA)
    library(reshape2)
    library(stringr)
    options(stringsAsFactors = FALSE)
    enableWGCNAThreads()
    
    run_wgcna <- function(mat,type = "unsigned",corType = "pearson", pct.mad=0.75, label='test',trait=NULL,tom=F,RsquaredCut=0.85,hug.top=50,power=NULL){
    corFnc = ifelse(corType=="pearson", cor, bicor)
    maxPOutliers = ifelse(corType=="pearson",1,0.05)
    robustY = ifelse(corType=="pearson",T,F)
    
    dataExpr <- mat
    
    #Data QC
    ##################################################
    m.mad <- apply(dataExpr,1,mad)
    dataExprVar <- as.matrix(dataExpr[which(m.mad > max(quantile(m.mad, probs=seq(0, 1, 1-pct.mad))[2],0.01)),])
    dataExpr <- as.data.frame(t(dataExprVar))
    
    gsg = goodSamplesGenes(dataExpr, verbose = 3)
    
    if (!gsg$allOK){
      if (sum(!gsg$goodGenes)>0)
        printFlush(paste("Removing genes:",
                         paste(names(dataExpr)[!gsg$goodGenes], collapse = ",")));
      if (sum(!gsg$goodSamples)>0)
        printFlush(paste("Removing samples:",
                         paste(rownames(dataExpr)[!gsg$goodSamples], collapse = ",")));
      dataExpr = dataExpr[gsg$goodSamples, gsg$goodGenes]
    }
    
    nGenes = ncol(dataExpr)
    nSamples = nrow(dataExpr)
    
    sampleTree = hclust(dist(dataExpr), method = "average")
    pdf('fig1.sampleTree.pdf',18,10)
    plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="")
    dev.off()
    ##################################################
    
    #calculate power
    ##################################################
    powers = c(c(1:10), seq(from = 12, to=30, by=2))
    sft = pickSoftThreshold(dataExpr, powerVector=powers,networkType=type, verbose=5, RsquaredCut=RsquaredCut)
    
    pdf('fig2.powers.pdf',18,10)
    par(mfrow = c(1,2))
    cex1 = 0.9
    plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
         xlab="Soft Threshold (power)",
         ylab="Scale Free Topology Model Fit,signed R^2",type="n",
         main = paste("Scale independence"))
    text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
         labels=powers,cex=cex1,col="red")
    abline(h=0.85,col="red")
    plot(sft$fitIndices[,1], sft$fitIndices[,5],
         xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
         main = paste("Mean connectivity"))
    text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers,
         cex=cex1, col="red")
    dev.off()
    saveRDS(sft,'sft.rds')
    saveRDS(dataExpr,'dataExpr.rds')
    if (is.null(power)) {
    power = sft$powerEstimate
    }
    print(paste0('The power value is : ',power))
    if (is.na(power)){
      power = ifelse(nSamples<20, ifelse(type == "unsigned", 9, 18),
              ifelse(nSamples<30, ifelse(type == "unsigned", 8, 16),
              ifelse(nSamples<40, ifelse(type == "unsigned", 7, 14),
              ifelse(type == "unsigned", 6, 12))
              )
              )
    }
    ##################################################
    
    #bulid co-expression network
    ##################################################
    net = blockwiseModules(dataExpr, power = power, maxBlockSize = nGenes,
                           TOMType = type, minModuleSize = 30,
                           reassignThreshold = 0, mergeCutHeight = 0.25,
                           numericLabels = TRUE, pamRespectsDendro = FALSE,
                           saveTOMs=TRUE, corType = corType,
                           maxPOutliers=maxPOutliers, loadTOMs=TRUE,
                           saveTOMFileBase = paste0(label, ".tom"),
                           verbose = 3)
    
    moduleLabels = net$colors
    moduleColors = labels2colors(moduleLabels)
    
    pdf('fig3.plotDendroAndColors.pdf',18,10)
    plotDendroAndColors(net$dendrograms[[1]], moduleColors[net$blockGenes[[1]]],
                        "Module colors",
                        dendroLabels = FALSE, hang = 0.03,
                        addGuide = TRUE, guideHang = 0.05)
    dev.off()
    
    MEs = net$MEs
    MEs_col = MEs
    colnames(MEs_col) = paste0("ME", labels2colors(
      as.numeric(str_replace_all(colnames(MEs),"ME",""))))
    MEs_col = orderMEs(MEs_col)
    
    
    pdf('fig4.plotEigengeneNetworks.pdf',18,10)
    plotEigengeneNetworks(MEs_col, "Eigengene adjacency heatmap",
                          marDendro = c(3,3,2,4),
                          marHeatmap = c(3,4,2,2), plotDendrograms = T,
                          xLabelsAngle = 90)
    dev.off()
    ##################################################
    #TOMplot
    #TOM = TOMsimilarityFromExpr(dataExpr, power=power, corType=corType, networkType=type)
    load(net$TOMFiles[1], verbose=T)
    TOM <- as.matrix(TOM)
    dissTOM = 1-TOM
    plotTOM = dissTOM^7
    diag(plotTOM) = NA
    
    if (tom) {
    pdf('fig5.TOMplot.pdf',18,10)
    TOMplot(plotTOM, net$dendrograms, moduleColors,
                    main = "Network heatmap plot, all genes")
    dev.off()
    }
    ##################################################
    #export network information
    ##################################################
    probes = colnames(dataExpr)
    dimnames(TOM) <- list(probes, probes)
    
    cyt = exportNetworkToCytoscape(TOM,
                 edgeFile = paste(label, ".edges.txt", sep=""),
                 nodeFile = paste(label, ".nodes.txt", sep=""),
                 weighted = TRUE, threshold = 0,
                 nodeNames = probes, nodeAttr = moduleColors)
    
    edges <- read.table(paste(label, ".edges.txt", sep=""),sep='\t',header=T)
    nodes <- read.table(paste(label, ".nodes.txt", sep=""),sep='\t',header=T)
    
    library(RColorBrewer)
    coul<-c(brewer.pal(8, "Dark2"),brewer.pal(12, "Paired"),brewer.pal(8, "Set2"),brewer.pal(9, "Set1"))
    
    colnames(nodes) <- c(colnames(nodes)[1:2],'group')
    lgroup <- unique(nodes$group)
    lgroup <- lgroup[-grep('grey',lgroup)]
    for (i in lgroup) {
    nodes1 <- subset(nodes,group==i)
    c1 <- which(edges[,1] %in% nodes1[,1])
    edges1 <- edges[c1,]
    c1 <- which(edges1[,2] %in% nodes1[,1])
    edges1 <- edges1[c1,]
    mtitle <- dim(nodes1)[1]
    plot_network(edges=edges1,nodes=nodes1,group.by=NULL,weight='weight',coul = coul,layout=layout.sphere,pf=paste0("all_genes_",i),main=mtitle)
    }
    
    ##################################################
    #get hub genes
    ##################################################
    
    library(dplyr)
    edgeData1 <- cyt$edgeData[,c(1,2,3)]
    edgeData2 <- cyt$edgeData[,c(2,1,3)]
    nodeattrib <- cyt$nodeData[,c(1,3)]
    colnames(nodeattrib) <- c("nodeName", "Module")
    colnames(edgeData1) <- c("Node1","Node2","Weight")
    colnames(edgeData2) <- c("Node1","Node2","Weight")
    edgeData <- rbind(edgeData1, edgeData2)
    edgeData$Module1 <- nodeattrib[match(edgeData$Node1, nodeattrib$nodeName), 2]
    edgeData$Module2 <- nodeattrib[match(edgeData$Node2, nodeattrib$nodeName), 2]
    edgeData <- edgeData[edgeData$Module1==edgeData$Module2,c(1,3,4)]
    
    nodeTotalWeight <- edgeData %>% group_by(Node1, Module1) %>% summarise(weight=sum(Weight))
    nodeTotalWeight <- nodeTotalWeight[with(nodeTotalWeight, order(Module1, -weight)),]
    nodeTotalWeightTop = nodeTotalWeight %>% group_by(Module1) %>% top_n(hug.top, weight) %>% data.frame()
    write.table(nodeTotalWeightTop,'hug_gene_list.xls',sep='\t',quote=F)
    lgroup <- unique(nodeTotalWeightTop$Module1)
    lgroup <- lgroup[-grep('grey',lgroup)]
    
    for (i in lgroup) {
    nodes1 <- subset(nodeTotalWeightTop,Module1==i)
    c1 <- which(edges[,1] %in% nodes1[,1])
    edges1 <- edges[c1,]
    c1 <- which(edges1[,2] %in% nodes1[,1])
    edges1 <- edges1[c1,]
    
    c1 <- which(nodes[,1] %in% nodes1[,1])
    nodes1 <- nodes[c1,]
    nodes1 <- subset(nodes1,group==i)
    
    mtitle <- dim(nodes1)[1]
    plot_network(edges=edges1,nodes=nodes1,group.by=NULL,weight='weight',coul = coul,layout=layout.sphere,pf=paste0("hug_genes_",i),main=mtitle)
    }
    
    
    #calculate correlation between gene modules and trait
    
    if(!is.null(trait)) {
      traitData = trait
      sampleName = rownames(dataExpr)
      traitData = traitData[match(sampleName, rownames(traitData)), ]
    
    if (corType=="pearsoon") {
      modTraitCor = cor(MEs_col, traitData, use = "p")
      modTraitP = corPvalueStudent(modTraitCor, nSamples)
    } else {
      modTraitCorP = bicorAndPvalue(MEs_col, traitData, robustY=robustY)
      modTraitCor = modTraitCorP$bicor
      modTraitP   = modTraitCorP$p
    }
    
    textMatrix = paste(signif(modTraitCor, 2), "\n(", signif(modTraitP, 1), ")", sep = "")
    dim(textMatrix) = dim(modTraitCor)
    pdf('fig6.labeledHeatmap.pdf',18,10)
    labeledHeatmap(Matrix = modTraitCor, xLabels = colnames(traitData),
                   yLabels = colnames(MEs_col),
                   cex.lab = 0.5,
                   ySymbols = colnames(MEs_col), colorLabels = FALSE,
                   colors = blueWhiteRed(50),
                   textMatrix = textMatrix, setStdMargins = FALSE,
                   cex.text = 0.5, zlim = c(-1,1),
                   main = paste("Module-trait relationships"))
    dev.off()
    }
    ##################################################
    #save result
    ob1 <- list(corType,dataExpr, MEs_col, traitData, nSamples, net)
    save(corType,dataExpr, MEs_col, traitData, nSamples, net, file=paste0(label,'_result.RData'))
    return(ob1)
    }
    

    主函数

    seurat_wgcna <- function(obj,group.by=NULL,size=NULL,slot='data',assay='RNA',fun=mean,
                             type = "unsigned",corType = "pearson", pct.mad=0.75, label='test',trait=NULL,nThreads=10){
    enableWGCNAThreads(nThreads=nThreads)
    if (!is.null(size)) {
    obj <- condense_cell(obj,group.by=group.by,size=size,slot=slot,assay=assay,fun=fun)
    saveRDS(obj,'condense.rds')
    }
    trait <- obj@meta.data
    run_wgcna(mat=obj@assays$RNA@counts,type = type, corType = corType, pct.mad=pct.mad, label=label,trait=trait)
    }
    

    总结与讨论

    因为单细胞数据一般细胞数太多,如果把每个细胞当成一个样本会极其损耗计算资源,因此构建假细胞是一个较好的办法。还有很多细节没有具体展开,之后有时间再补充。

    相关文章

      网友评论

          本文标题:使用WGCNA分析单细胞转录组数据(懒人版) 2023-05-1

          本文链接:https://www.haomeiwen.com/subject/tzrisdtx.html