WGCNA

作者: 陈宇乔 | 来源:发表于2019-05-15 21:29 被阅读7次

    参考

    http://www.bio-info-trainee.com/2535.html
    WGCNA分析,简单全面的最新教程
    https://www.jianshu.com/p/f0409a045dab

    image.png
    image.png
    image.png
    library(WGCNA)
    
    options(stringsAsFactors = FALSE)
    # 打开多线程
    enableWGCNAThreads()
    load(file = './Rdata/step2_id_transed.Rdata')
    
    # 常规表达矩阵,log2转换后或
    # Deseq2的varianceStabilizingTransformation转换的数据
    # 如果有批次效应,需要事先移除,可使用removeBatchEffect
    # 如果有系统偏移(可用boxplot查看基因表达分布是否一致),
    # 需要quantile normalization
    
    exprSet<- log2(new_exprSet+1)
    pdata<- pdata[1:20,]
    exprSet_test<- exprSet[,1:20]
    # 官方推荐 "signed" 或 "signed hybrid"
    # 为与原文档一致,故未修改 
    type = "unsigned"
    
    
    # corFnc = ifelse(corType=="pearson", cor, bicor)
    # 对二元变量,如样本性状信息计算相关性时,
    # 或基因表达严重依赖于疾病状态时,需设置下面参数
    # maxPOutliers = ifelse(corType=="pearson",1,0.05)
    
    # 关联样品性状的二元变量时,设置
    # robustY = ifelse(corType=="pearson",T,F)
    
    
    dataExpr<- exprSet_test
    dataExpr<- exprSet
    dim(dataExpr)
    head(dataExpr)[,1:8]
    
    ######################## step2数据筛选
    ## 筛选中位绝对偏差前75%的基因,至少MAD大于0.01
    ## 筛选后会降低运算量,也会失去部分信息
    ## 也可不做筛选,使MAD大于0即可
    # m.mad <- apply(dataExpr,1,mad)
    # dataExprVar <- dataExpr[which(m.mad > max(quantile(m.mad, probs=seq(0, 1, 0.25))[2],0.01)),]
    # 
    # ## 转换为样品在行,基因在列的矩阵
    # dataExpr <- as.data.frame(t(dataExprVar))
    # 
    
    
    ## 因为WGCNA针对的是基因进行聚类,而一般我们的聚类是针对样本用hclust即可,所以这个时候需要转置。
    WGCNA_matrix = t(dataExpr[order(apply(dataExpr,1,mad), decreasing = T)[1:5000],])
    datExpr0 <- WGCNA_matrix  ## top 5000 mad genes
    dataExpr <- datExpr0 
    
    
    ## 检测缺失值
    gsg = goodSamplesGenes(dataExpr, verbose = 3)
    
    if (!gsg$allOK){
      # Optionally, print the gene and sample names that were removed:
      if (sum(!gsg$goodGenes)>0) 
        printFlush(paste("Removing genes:", 
                         paste(names(dataExpr)[!gsg$goodGenes], collapse = ",")));
      if (sum(!gsg$goodSamples)>0) 
        printFlush(paste("Removing samples:", 
                         paste(rownames(dataExpr)[!gsg$goodSamples], collapse = ",")));
      # Remove the offending genes and samples from the data:
      dataExpr = dataExpr[gsg$goodSamples, gsg$goodGenes]
    }
    nGenes = ncol(dataExpr)
    nSamples = nrow(dataExpr)
    
    dim(dataExpr)
    
    ########################### step2 软阈值筛选
    
    ## 查看是否有离群样品
    sampleTree = hclust(dist(dataExpr), method = "average")
    plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="")
    
    
    
    # 软阈值的筛选原则是使构建的网络更符合无标度网络特征。
    
    powers = c(c(1:10), seq(from = 12, to=30, by=2))
    sft = pickSoftThreshold(dataExpr, powerVector=powers, 
                            networkType=type, verbose=5)
    
    par(mfrow = c(1,2))
    cex1 = 0.9
    # 横轴是Soft threshold (power),纵轴是无标度网络的评估参数,数值越高,
    # 网络越符合无标度特征 (non-scale)
    plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
         xlab="Soft Threshold (power)",
         ylab="Scale Free Topology Model Fit,signed R^2",type="n",
         main = paste("Scale independence"))
    text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
         labels=powers,cex=cex1,col="red")
    
    # 筛选标准。R-square=0.85
    abline(h=0.85,col="red")
    
    # Soft threshold与平均连通性
    plot(sft$fitIndices[,1], sft$fitIndices[,5],
         xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
         main = paste("Mean connectivity"))
    text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, 
         cex=cex1, col="red")
    power = sft$powerEstimate
    power
    
    
    # ### 经验power (无满足条件的power时选用)
    # # 无向网络在power小于15或有向网络power小于30内,没有一个power值可以使
    # # 无标度网络图谱结构R^2达到0.8,平均连接度较高如在100以上,可能是由于
    # # 部分样品与其他样品差别太大。这可能由批次效应、样品异质性或实验条件对
    # # 表达影响太大等造成。可以通过绘制样品聚类查看分组信息和有无异常样品。
    # # 如果这确实是由有意义的生物变化引起的,也可以使用下面的经验power值。
    # if (is.na(power)){
    #   power = ifelse(nSamples<20, ifelse(type == "unsigned", 9, 18),
    #                  ifelse(nSamples<30, ifelse(type == "unsigned", 8, 16),
    #                         ifelse(nSamples<40, ifelse(type == "unsigned", 7, 14),
    #                                ifelse(type == "unsigned", 6, 12))       
    #                  )
    #   )
    # }
    
    
    
    
    
    ############## step3 网络构建
    ##一步法网络构建:One-step network construction and module detection##
    # power: 上一步计算的软阈值
    # maxBlockSize: 计算机能处理的最大模块的基因数量 (默认5000);
    #  4G内存电脑可处理8000-10000个,16G内存电脑可以处理2万个,32G内存电脑可
    #  以处理3万个
    #  计算资源允许的情况下最好放在一个block里面。
    # corType: pearson or bicor
    # numericLabels: 返回数字而不是颜色作为模块的名字,后面可以再转换为颜色
    # saveTOMs:最耗费时间的计算,存储起来,供后续使用
    # mergeCutHeight: 合并模块的阈值,越大模块越少
    
    
    ###### 20的内存都不够用,所以maxBlockSize只能选取前5000gene
    maxPOutliers=0
    ### 软阈值
    power=sft$powerEstimate
    # 相关性计算
    # 官方推荐 biweight mid-correlation & bicor
    # corType: pearson or bicor
    # 为与原文档一致,故未修改
    corType = "pearson"
    exprMat <- "WGCNA/LiverFemaleClean.txt"  
    ##### 这个很关键,需要创建WGCNA的文件夹,然后创建LiverFemaleClean的文本
    #否则报错 Error in gzfile(file, "wb") : cannot open the connection
    
    
    net = blockwiseModules(dataExpr, power = power,
                           # maxBlockSize = nGenes,
                           maxBlockSize = 6000,
                           TOMType = type, minModuleSize = 30,
                           reassignThreshold = 0, mergeCutHeight = 0.25,
                           numericLabels = TRUE, pamRespectsDendro = FALSE,
                           saveTOMs=TRUE, corType = corType,
                           maxPOutliers=maxPOutliers, loadTOMs=TRUE,
                           saveTOMFileBase = paste0(exprMat, ".tom"),  ##### 这个很关键,需要创建WGCNA的文件夹,然后创建LiverFemaleClean的文本
                           verbose = 3)
    ### 报错 Error in (new("standardGeneric", .Data = function (x, y = NULL, use = "everything",  :    unused arguments (weights.x = NULL, weights.y = NULL, cosine = FALSE)
    ## 解决方案:I think I tackle the problem. There is a conflict between the WGCNA and the other packages. the other package have a function the same as the other in WGCNA in the run r studio. when I library no packages other than WGCNA, the program runs well and get the module. Thanks for professor Kevin Blighe's advice.
    ## 也就是不要加载别的程序,cor这个命令可能和别的包相互冲突https://www.biostars.org/p/305714/
    
    # jimi版http://www.bio-info-trainee.com/2535.html
    # net = blockwiseModules(
    #   datExpr,
    #   power = sft$powerEstimate,
    #   maxBlockSize = 6000,
    #   TOMType = "unsigned", minModuleSize = 30,
    #   reassignThreshold = 0, mergeCutHeight = 0.25,
    #   numericLabels = TRUE, pamRespectsDendro = FALSE,
    #   saveTOMs = TRUE,
    #   saveTOMFileBase = "AS-green-FPKM-TOM",
    #   verbose = 3
    # )
    
    # 根据模块中基因数目的多少,降序排列,依次编号为 `1-最大模块数`。
    # **0 (grey)**表示**未**分入任何模块的基因。 
    table(net$colors)
    
    #### step4  层级聚类树展示各个模块
    ## 灰色的为**未分类**到模块的基因。
    # Convert labels to colors for plotting
    moduleLabels = net$colors
    moduleColors = labels2colors(moduleLabels)
    # Plot the dendrogram and the module colors underneath
    # 如果对结果不满意,还可以recutBlockwiseTrees,节省计算时间
    plotDendroAndColors(net$dendrograms[[1]], moduleColors[net$blockGenes[[1]]],
                        "Module colors",
                        dendroLabels = FALSE, hang = 0.03,
                        addGuide = TRUE, guideHang = 0.05)
    
    #### step5 绘制模块之间相关性图
    
    # module eigengene, 可以绘制线图,作为每个模块的基因表达趋势的展示
    MEs = net$MEs
    
    ### 不需要重新计算,改下列名字就好
    ### 官方教程是重新计算的,起始可以不用这么麻烦
    MEs_col = MEs
    library(stringr)
    colnames(MEs_col) = paste0("ME", labels2colors(
      as.numeric(str_replace_all(colnames(MEs),"ME",""))))
    MEs_col = orderMEs(MEs_col)
    
    # 根据基因间表达量进行聚类所得到的各模块间的相关性图
    # marDendro/marHeatmap 设置下、左、上、右的边距
    plotEigengeneNetworks(MEs_col, "Eigengene adjacency heatmap", 
                          marDendro = c(3,3,2,4),
                          marHeatmap = c(3,4,2,2), plotDendrograms = T, 
                          xLabelsAngle = 90)
    
    # ## 如果有表型数据,也可以跟ME数据放一起,一起出图
    # sampleName = rownames(dataExpr)
    # pdata[1:4,1:4]
    # # traitData<- data.frame(sex=pdata$`Sex:ch1`,status = pdata$`status:ch1`)
    # traitData<- data.frame(sex=as.numeric(factor(pdata$`Sex:ch1`)),status = as.numeric(factor(pdata$`status:ch1`)))
    # row.names(traitData)<- row.names(pdata)
    # traitData[1:2,1:2]
    # match(sampleName, rownames(traitData))
    # traitData = traitData[match(sampleName, rownames(traitData)), ]
    # MEs_colpheno = orderMEs(cbind(MEs_col, traitData))
    # plotEigengeneNetworks(MEs_colpheno, "Eigengene adjacency heatmap", 
    #                       marDendro = c(3,3,2,4),
    #                       marHeatmap = c(3,4,2,2), plotDendrograms = T, 
    #                       xLabelsAngle = 90)
    
    #### step6 可视化基因网络 (TOM plot)
    
    # 如果采用分步计算,或设置的blocksize>=总基因数,直接load计算好的TOM结果
    # 否则需要再计算一遍,比较耗费时间
    # TOM = TOMsimilarityFromExpr(dataExpr, power=power, corType=corType, networkType=type)
    load(net$TOMFiles[1], verbose=T)
    
    ## Loading objects:
    ##   TOM
    
    TOM <- as.matrix(TOM)
    TOM[1:4,1:4]
    dissTOM = 1-TOM
    # Transform dissTOM with a power to make moderately strong 
    # connections more visible in the heatmap
    plotTOM = dissTOM^7
    # Set diagonal to NA for a nicer plot
    diag(plotTOM) = NA
    # Call the plot function
    
    # 这一部分特别耗时,行列同时做层级聚类
    moduleColors = labels2colors(moduleLabels)
    
    
    TOMplot(plotTOM, net$dendrograms, moduleColors, 
            main = "Network heatmap plot, all genes")
    ####### 又报错了可能是因为样本量和基因数量太大了,当我减少基因数量和样本量时,没有再次报错
    # Error in TOMplot(plotTOM, net$dendrograms, moduleColors, main = "Network heatmap plot, all genes") : 
    #   ERROR: number of color labels does not equal number of nodes in dissim.
    # nNodes != dim(dissim)[[1]] 
    #### 计算了1个小时,最好的选择是,随机挑选400个基因进行计算 
    # #然后随机选取部分基因作图
    # nSelect = 400
    # # For reproducibility, we set the random seed
    # set.seed(10);
    # select = sample(nGenes, size = nSelect);
    # selectTOM = dissTOM[select, select];
    # # There’s no simple way of restricting a clustering tree to a subset of genes, so we must re-cluster.
    # selectTree = hclust(as.dist(selectTOM), method = "average")
    # selectColors = moduleColors[select];
    # # Open a graphical window
    # sizeGrWindow(9,9)
    # # Taking the dissimilarity to a power, say 10, makes the plot more informative by effectively changing
    # # the color palette; setting the diagonal to NA also improves the clarity of the plot
    # plotDiss = selectTOM^7;
    # diag(plotDiss) = NA;
    # TOMplot(plotDiss, selectTree, selectColors, main = "Network heatmap plot, selected genes")
    
    
    # step7  导出网络用于Cytoscape
    probes = colnames(dataExpr)
    dimnames(TOM) <- list(probes, probes)
    
    # Export the network into edge and node list files Cytoscape can read
    # threshold 默认为0.5, 可以根据自己的需要调整,也可以都导出后在
    # cytoscape中再调整
    cyt = exportNetworkToCytoscape(TOM,
                                   edgeFile = paste(exprMat, ".edges.txt", sep=""),
                                   nodeFile = paste(exprMat, ".nodes.txt", sep=""),
                                   weighted = TRUE, threshold = 0,
                                   nodeNames = probes, nodeAttr = moduleColors)
    
    
    ### step8 关联表型数据
    
    # trait <- "WGCNA/TraitsClean.txt"
    # # 读入表型数据,不是必须的
    # if(trait != "") {
    #   traitData <- read.table(file=trait, sep='\t', header=T, row.names=1,
    #                           check.names=FALSE, comment='',quote="")
    #   sampleName = rownames(dataExpr)
    #   traitData = traitData[match(sampleName, rownames(traitData)), ]
    # }
    
    # sampleName = rownames(dataExpr)
    # pdata[1:4,1:4]
    # # traitData<- data.frame(sex=pdata$`Sex:ch1`,status = pdata$`status:ch1`)
    # traitData<- data.frame(sex=as.numeric(factor(pdata$`Sex:ch1`)),status = as.numeric(factor(pdata$`status:ch1`)))
    # row.names(traitData)<- row.names(pdata)
    # traitData[1:2,1:2]
    # match(sampleName, rownames(traitData))
    # traitData = traitData[match(sampleName, rownames(traitData)), ]
    
    ### 模块与表型数据关联
    if (corType=="pearson") {
      modTraitCor = cor(MEs_col, traitData, use = "p")
      modTraitP = corPvalueStudent(modTraitCor, nSamples)
    } else {
      modTraitCorP = bicorAndPvalue(MEs_col, traitData, robustY=robustY)
      modTraitCor = modTraitCorP$bicor
      modTraitP   = modTraitCorP$p
    }
    
    ## Warning in bicor(x, y, use = use, ...): bicor: zero MAD in variable 'y'.
    ## Pearson correlation was used for individual columns with zero (or missing)
    ## MAD.
    
    # signif表示保留几位小数
    textMatrix = paste(signif(modTraitCor, 2), "\n(", signif(modTraitP, 1), ")", sep = "")
    dim(textMatrix) = dim(modTraitCor)
    labeledHeatmap(Matrix = modTraitCor, xLabels = colnames(traitData), 
                   yLabels = colnames(MEs_col), 
                   cex.lab = 0.5, 
                   ySymbols = colnames(MEs_col), colorLabels = FALSE, 
                   colors = blueWhiteRed(50), 
                   textMatrix = textMatrix, setStdMargins = FALSE, 
                   cex.text = 0.5, zlim = c(-1,1),
                   main = paste("Module-trait relationships"))
    
    #### step9 模块内基因与表型数据关联, 
    
    ## 从上图可以看到MEmagenta与Insulin_ug_l相关
    
    ## 模块内基因与表型数据关联
    
    # 性状跟模块虽然求出了相关性,可以挑选最相关的那些模块来分析,
    # 但是模块本身仍然包含非常多的基因,还需进一步的寻找最重要的基因。
    # 所有的模块都可以跟基因算出相关系数,所有的连续型性状也可以跟基因的表达
    # 值算出相关系数。
    # 如果跟性状显著相关基因也跟某个模块显著相关,那么这些基因可能就非常重要
    # 。
    
    ### 计算模块与基因的相关性矩阵
    
    if (corType=="pearson") {
      geneModuleMembership = as.data.frame(cor(dataExpr, MEs_col, use = "p"))
      MMPvalue = as.data.frame(corPvalueStudent(
        as.matrix(geneModuleMembership), nSamples))
    } else {
      geneModuleMembershipA = bicorAndPvalue(dataExpr, MEs_col, robustY=robustY)
      geneModuleMembership = geneModuleMembershipA$bicor
      MMPvalue   = geneModuleMembershipA$p
    }
    
    
    # 计算性状与基因的相关性矩阵
    
    ## 只有连续型性状才能进行计算,如果是离散变量,在构建样品表时就转为0-1矩阵。
    
    if (corType=="pearson") {
      geneTraitCor = as.data.frame(cor(dataExpr, traitData, use = "p"))
      geneTraitP = as.data.frame(corPvalueStudent(
        as.matrix(geneTraitCor), nSamples))
    } else {
      geneTraitCorA = bicorAndPvalue(dataExpr, traitData, robustY=robustY)
      geneTraitCor = as.data.frame(geneTraitCorA$bicor)
      geneTraitP   = as.data.frame(geneTraitCorA$p)
    }
    
    ## Warning in bicor(x, y, use = use, ...): bicor: zero MAD in variable 'y'.
    ## Pearson correlation was used for individual columns with zero (or missing)
    ## MAD.
    
    # 最后把两个相关性矩阵联合起来,指定感兴趣模块进行分析
    module = "magenta"
    pheno = "sex"
    modNames = substring(colnames(MEs_col), 3)
    # 获取关注的列
    module_column = match(module, modNames)
    pheno_column = match(pheno,colnames(traitData))
    # 获取模块内的基因
    moduleGenes = moduleColors == module
    
    sizeGrWindow(7, 7)
    par(mfrow = c(1,1))
    # 与性状高度相关的基因,也是与性状相关的模型的关键基因
    verboseScatterplot(abs(geneModuleMembership[moduleGenes, module_column]),
                       abs(geneTraitCor[moduleGenes, pheno_column]),
                       xlab = paste("Module Membership in", module, "module"),
                       ylab = paste("Gene significance for", pheno),
                       main = paste("Module membership vs. gene significance\n"),
                       cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
    
    
    
    #### step10 分步法展示每一步都做了什么
    ## 计算邻接矩阵
    adjacency = adjacency(dataExpr, power = power)
    
    ### 把邻接矩阵转换为拓扑重叠矩阵,以降低噪音和假相关,获得距离矩阵。
    TOM = TOMsimilarity(adjacency)
    dissTOM = 1-TOM
    
    ### 层级聚类计算基因之间的距离树 
    geneTree = hclust(as.dist(dissTOM), method = "average")
    
    ### 模块合并
    # We like large modules, so we set the minimum module size relatively high:
    minModuleSize = 30
    # Module identification using dynamic tree cut:
    dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,
                                deepSplit = 2, pamRespectsDendro = FALSE,
                                minClusterSize = minModuleSize)
    # Convert numeric lables into colors
    dynamicColors = labels2colors(dynamicMods)
    
    ### 通过计算模块的代表性模式和模块之间的定量相似性评估,合并表达图谱相似的模块
    MEList = moduleEigengenes(dataExpr, colors = dynamicColors)
    MEs = MEList$eigengenes
    # Calculate dissimilarity of module eigengenes
    MEDiss = 1-cor(MEs)
    # Cluster module eigengenes
    METree = hclust(as.dist(MEDiss), method = "average")
    MEDissThres = 0.25
    
    # Call an automatic merging function
    merge = mergeCloseModules(dataExpr, dynamicColors, cutHeight = MEDissThres, verbose = 3)
    # The merged module colors
    mergedColors = merge$colors;
    # Eigengenes of the new merged
    
    ## 分步法完结
    
    
    

    相关文章

      网友评论

        本文标题:WGCNA

        本文链接:https://www.haomeiwen.com/subject/ujxvaqtx.html