WGCNA分析代码

作者: 纵春水东流 | 来源:发表于2019-07-15 23:34 被阅读18次

    黑暗网址https://zhuanlan.zhihu.com/p/42970323

    1、分析流程

    image.png

    2、过程

    #1第1步:输入数据的准备
    
    

    9、代码部分

    #读入原始数据
    #原始数据包含64个样本,9904个lncRNA表达量,其中第一列是lncRNA_ID,第66列是Annotation。
    setwd("G:/My_exercise/WGCNA/")
    lncRNAexpr <- read.csv("All_sample_LncRNA_exp_RPKM.csv",sep=",",header = T)
    #重命名数据列表,行名和列名
    fpkm <- lncRNAexpr[,-66]##去掉Annotation这列
    rownames(fpkm)=fpkm[,1]
    fpkm=fpkm[,-1] 
    #准备表型信息
    subname=sapply(colnames(fpkm),function(x) strsplit(x,"_")[[1]][1])
    datTraits = data.frame(gsm=names(fpkm),
                           subtype=subname)
    rownames(datTraits)=datTraits[,1]
    head(datTraits)
    #下载并载入WGCNA包
    source("http://bioconductor.org/biocLite.R")
    #biocLite(c("AnnotationDbi", "impute", "GO.db", "preprocessCore"))
    #如果已经下载过了,这里就不用下载
    biocLite("WGCNA")
    library(WGCNA)
    #行列转置
    #WGCNA针对的是基因进行聚类,而一般我们的聚类是针对样本
    #用hclust即可,也就是说要转置为行名是样本,列名是基因。
    RNAseq_voom <- fpkm 
    WGCNA_matrix = t(RNAseq_voom[order(apply(RNAseq_voom,1,mad), decreasing = T)[1:5000],])
    datExpr <- WGCNA_matrix  ## top 5000 mad genes
    datExpr[1:4,1:4]
    #确定临床表型与样本名字
    sampleNames = rownames(datExpr);
    traitRows = match(sampleNames, datTraits$gsm)  
    rownames(datTraits) = datTraits[traitRows, 1]
    
    #STEP2:确定最佳soft-thresholding power
    # Choose a set of soft-thresholding powers
    powers = c(c(1:10), seq(from = 12, to=20, by=2))
    # Call the network topology analysis function
    sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
    # Plot the results:
    par(mfrow = c(1,2));
    cex1 = 0.9;
    # Scale-free topology fit index as a function of the soft-thresholding power
    plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
         xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
         main = paste("Scale independence"));
    text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
         labels=powers,cex=cex1,col="red");
    # this line corresponds to using an R^2 cut-off of h
    abline(h=0.90,col="green")
    # Mean connectivity as a function of the soft-thresholding power
    plot(sft$fitIndices[,1], sft$fitIndices[,5],
         xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
         main = paste("Mean connectivity"))
    text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
    #选择beta值
    best_beta=sft$powerEstimate
    #> best_beta
    [1] 3
    
    #STEP3: 一步法构建共表达矩阵
    net = blockwiseModules(datExpr, power = sft$powerEstimate,
                           maxBlockSize = 6000,TOMType = "unsigned", 
                           minModuleSize = 30,reassignThreshold = 0, mergeCutHeight = 0.25,
                           numericLabels = TRUE, pamRespectsDendro = FALSE,
                           saveTOMs = TRUE,
                           saveTOMFileBase = "AS-green-FPKM-TOM",
                           verbose = 3)
    
    #STEP4:模块鉴定及可视化
    table(net$colors)
    # Convert labels to colors for plotting
    mergedColors = labels2colors(net$colors)
    table(mergedColors)
    # Plot the dendrogram and the module colors underneath
    plotDendroAndColors(net$dendrograms[[1]], mergedColors[net$blockGenes[[1]]],
                        "Module colors",
                        dendroLabels = FALSE, hang = 0.03,
                        addGuide = TRUE, guideHang = 0.05)
    ## assign all of the gene to their corresponding module 
    ## hclust for the genes.
    #明确样本数和基因数
    nGenes = ncol(datExpr)
    nSamples = nrow(datExpr)
    #首先针对样本做个系统聚类树
    datExpr_tree<-hclust(dist(datExpr), method = "average")
    par(mar = c(0,5,2,0))
    plot(datExpr_tree, main = "Sample clustering", sub="", xlab="", cex.lab = 2, 
         cex.axis = 1, cex.main = 1,cex.lab=1)
    ## 如果这个时候样本是有性状,或者临床表型的,可以加进去看看是否聚类合理
    #针对前面构造的样品矩阵添加对应颜色
    sample_colors <- numbers2colors(as.numeric(factor(datTraits$subtype)), 
                                    colors = c("grey","blue","red","green"),signed = FALSE)
    ## 这个给样品添加对应颜色的代码需要自行修改以适应自己的数据分析项目。
    #  sample_colors <- numbers2colors( datTraits ,signed = FALSE)
    ## 如果样品有多种分类情况,而且 datTraits 里面都是分类信息,那么可以直接用上面代码,
    ##当然,这样给的颜色不明显,意义不大。
    #构造10个样品的系统聚类树及性状热图
    par(mar = c(1,4,3,1),cex=0.8)
    plotDendroAndColors(datExpr_tree, sample_colors,
                        groupLabels = colnames(sample),
                        cex.dendroLabels = 0.8,
                        marAll = c(1, 4, 3, 1),
                        cex.rowText = 0.01,
                        main = "Sample dendrogram and trait heatmap")
    
    #STEP5:模块和性状的关系
    design=model.matrix(~0+ datTraits$subtype)
    colnames(design)=levels(datTraits$subtype)
    moduleColors <- labels2colors(net$colors)
    # Recalculate MEs with color labels
    MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes
    MEs = orderMEs(MEs0); ##不同颜色的模块的ME值矩阵(样本vs模块)
    moduleTraitCor = cor(MEs, design , use = "p");
    moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
    
    sizeGrWindow(10,6)
    # Will display correlations and their p-values
    textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
                       signif(moduleTraitPvalue, 1), ")", sep = "");
    dim(textMatrix) = dim(moduleTraitCor)
    par(mar = c(6, 8.5, 3, 3));
    # Display the correlation values within a heatmap plot
    labeledHeatmap(Matrix = moduleTraitCor,
                   xLabels = names(design),
                   yLabels = names(MEs),
                   ySymbols = names(MEs),
                   colorLabels = FALSE,
                   colors = greenWhiteRed(50),
                   textMatrix = textMatrix,
                   setStdMargins = FALSE,
                   cex.text = 0.5,
                   zlim = c(-1,1),
                   main = paste("Module-trait relationships"))
    
    #STEP6:感兴趣性状的模块的具体基因分析
    ##首先计算模块与基因的相关性矩阵
    # names (colors) of the modules
    modNames = substring(names(MEs), 3)
    geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p"));
    ## 算出每个模块跟基因的皮尔森相关系数矩阵
    ## MEs是每个模块在每个样本里面的值
    ## datExpr是每个基因在每个样本的表达量
    MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples));
    names(geneModuleMembership) = paste("MM", modNames, sep="");
    names(MMPvalue) = paste("p.MM", modNames, sep="");
    ##再计算性状与基因的相关性矩阵
    ## 只有连续型性状才能只有计算
    ## 这里把是否属于 CB 表型这个变量用0,1进行数值化。
    CB = as.data.frame(design[,2]);
    names(CB) = "CB"
    geneTraitSignificance = as.data.frame(cor(datExpr, CB, use = "p"));
    GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples));
    names(geneTraitSignificance) = paste("GS.", names(CB), sep="");
    names(GSPvalue) = paste("p.GS.", names(CB), sep="")
    最后把两个相关性矩阵联合起来,指定感兴趣模块进行分析
    module = "turquoise"
    column = match(module, modNames);
    moduleGenes = moduleColors==module;
    sizeGrWindow(7, 7);
    par(mfrow = c(1,1));
    verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
                       abs(geneTraitSignificance[moduleGenes, 1]),
                       xlab = paste("Module Membership in", module, "module"),
                       ylab = "Gene significance for CB",
                       main = paste("Module membership vs. gene significance\n"),
                       cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
    
    
    #STEP7:网络的可视化
    #首先针对所有基因画热图
    nGenes = ncol(datExpr)
    nSamples = nrow(datExpr)
    geneTree = net$dendrograms[[1]]; 
    dissTOM = 1-TOMsimilarityFromExpr(datExpr, power = sft$powerEstimate); # 设置power=sft$powerEstimate,最佳beta值,此处是3
    plotTOM = dissTOM^7; 
    diag(plotTOM) = NA; 
    #TOMplot(plotTOM, geneTree, moduleColors, main = "Network heatmap plot, all genes")
    
    #然后随机选取部分基因作图
    nSelect = 400
    # For reproducibility, we set the random seed
    set.seed(10);
    select = sample(nGenes, size = nSelect);
    selectTOM = dissTOM[select, select];
    # There’s no simple way of restricting a clustering tree to a subset of genes, so we must re-cluster.
    selectTree = hclust(as.dist(selectTOM), method = "average")
    selectColors = moduleColors[select];
    # Open a graphical window
    sizeGrWindow(9,9)
    # Taking the dissimilarity to a power, say 10, makes the plot more informative by effectively changing
    # the color palette; setting the diagonal to NA also improves the clarity of the plot
    plotDiss = selectTOM^7;
    diag(plotDiss) = NA;
    TOMplot(plotDiss, selectTree, selectColors, main = "Network heatmap plot, selected genes")
    #最后画模块和性状的关系
    # Recalculate module eigengenes
    MEs = moduleEigengenes(datExpr, moduleColors)$eigengenes
    ## 只有连续型性状才能只有计算
    ## 这里把是否属于 Luminal 表型这个变量用0,1进行数值化。
    CB = as.data.frame(design[,2]);
    names(CB) = "CB"
    # Add the weight to existing module eigengenes
    MET = orderMEs(cbind(MEs, CB))
    # Plot the relationships among the eigengenes and the trait
    sizeGrWindow(5,7.5);
    par(cex = 0.9)
    plotEigengeneNetworks(MET, "", marDendro = c(0,4,1,2), marHeatmap = c(3,4,1,2), cex.lab = 0.8, xLabelsAngle
                          = 90)
    # Plot the dendrogram
    sizeGrWindow(6,6);
    par(cex = 1.0)
    ## 模块的聚类图
    plotEigengeneNetworks(MET, "Eigengene dendrogram", marDendro = c(0,4,2,0),
                          plotHeatmaps = FALSE)
    # Plot the heatmap matrix (note: this plot will overwrite the dendrogram plot)
    par(cex = 1.0)
    ## 性状与模块热图
    plotEigengeneNetworks(MET, "Eigengene adjacency heatmap", marHeatmap = c(3,4,2,2),
                          plotDendrograms = FALSE, xLabelsAngle = 90)
    #STEP8:提取指定模块的基因名
    #提取基因信息,进行下游分析包括GO/KEGG等功能数据库的注释
    # Select module
    module = "turquoise";
    # Select module probes
    probes = colnames(datExpr) ## 我们例子里面的probe就是基因名
    inModule = (moduleColors==module);
    modProbes = probes[inModule];
    #GO分析
    #STEP9: 模块的导出
    # Recalculate topological overlap
    TOM = TOMsimilarityFromExpr(datExpr, power = sft$powerEstimate); 
    # Select module
    module = "turquoise";
    # Select module probes
    probes = colnames(datExpr) ## 我们例子里面的probe就是基因名
    inModule = (moduleColors==module);
    modProbes = probes[inModule]; 
    ## 也是提取指定模块的基因名
    # Select the corresponding Topological Overlap
    modTOM = TOM[inModule, inModule];
    dimnames(modTOM) = list(modProbes, modProbes)
    #模块对应的基因关系矩阵
    #首先是导出到VisANT
    vis = exportNetworkToVisANT(modTOM,
                                file = paste("VisANTInput-", module, ".txt", sep=""),
                                weighted = TRUE,
                                threshold = 0)
    然后是导出到cytoscape
    cyt = exportNetworkToCytoscape(
      modTOM,
      edgeFile = paste("CytoscapeInput-edges-", paste(module, collapse="-"), ".txt", sep=""),
      nodeFile = paste("CytoscapeInput-nodes-", paste(module, collapse="-"), ".txt", sep=""),
      weighted = TRUE,
      threshold = 0.02,
      nodeNames = modProbes, 
      nodeAttr = moduleColors[inModule]
    );
    
    #STEP10: 模块内的分析—— 提取hub genes
    #hub genes指模块中连通性(connectivity)较高的基因,如设定排名
    #前30或前10%)。
    #高连通性的Hub基因通常为调控因子(调控网络中处于偏上游的位
    #置),而低连通性的基因通常为调控网络中偏下游的基因(例如,转运蛋白、催化酶等)。
    #HubGene: |kME| >=阈值(0.8)
    #(1)计算连通性
    ### Intramodular connectivity, module membership, and screening for intramodular hub genes
    
    # (1) Intramodular connectivity
    
    # moduleColors <- labels2colors(net$colors)
    connet=abs(cor(datExpr,use="p"))^6
    Alldegrees1=intramodularConnectivity(connet, moduleColors)
    head(Alldegrees1)
    #(2)模块内的连通性与gene显著性的关系
    # (2) Relationship between gene significance and intramodular connectivity
    which.module="black"
    EB= as.data.frame(design[,2]); # change specific 
    names(EB) = "EB"
    GS1 = as.numeric(cor(EB,datExpr, use="p"))
    GeneSignificance=abs(GS1)
    colorlevels=unique(moduleColors)
    sizeGrWindow(9,6)
    par(mfrow=c(2,as.integer(0.5+length(colorlevels)/2)))
    par(mar = c(4,5,3,1))
    for (i in c(1:length(colorlevels)))
    {
      whichmodule=colorlevels[[i]];
      restrict1 = (moduleColors==whichmodule);
      verboseScatterplot(Alldegrees1$kWithin[restrict1],
                         GeneSignificance[restrict1], col=moduleColors[restrict1],
                         main=whichmodule,
                         xlab = "Connectivity", ylab = "Gene Significance", abline = TRUE)
    }
    #(3)计算模块内所有基因的连通性, 筛选hub genes
    #abs(GS1)> .9 可以根据实际情况调整参数
    #abs(datKME$MM.black)>.8 至少大于 >0.8
    ###(3) Generalizing intramodular connectivity for all genes on the array
    datKME=signedKME(datExpr, MEs, outputColumnName="MM.")
    # Display the first few rows of the data frame
    head(datKME)
    ##Finding genes with high gene significance and high intramodular connectivity in
    # interesting modules
    # abs(GS1)> .9 可以根据实际情况调整参数
    # abs(datKME$MM.black)>.8 至少大于 >0.8
    FilterGenes= abs(GS1)> .9 & abs(datKME$MM.black)>.8
    table(FilterGenes)
    
    #STEP11: 其他分析
    #(1) another plot for realtionship between module eigengenes
    plotMEpairs(MEs,y=datTraits$cellType)
    (2) Diagnostics: heatmap plots of module expression
    sizeGrWindow(8,9)
    #par(mfrow=c(3,1), mar=c(1, 2, 4, 1))
    # for black module
    which.module="blue";
    plotMat(t(scale(datExpr[,moduleColors==which.module ]) ),nrgcols=30,rlabels=T,
            clabels=T,rcols=which.module,
            title=which.module )
    #(3) Diagnostics: displaying module heatmap and the eigengene
    sizeGrWindow(8,7);
    which.module="blue"
    ME=MEs[, paste("ME",which.module, sep="")]
    par(mfrow=c(2,1), mar=c(0.3, 5.5, 3, 2))
    plotMat(t(scale(datExpr[,moduleColors==which.module ]) ),
            nrgcols=30,rlabels=F,rcols=which.module,
            main=which.module, cex.main=2)
    par(mar=c(5, 4.2, 0, 0.7))
    barplot(ME, col=which.module, main="", cex.main=2,
            ylab="eigengene expression",xlab="MPP")
    
    
    
    
    
    
    
    
    
    
    

    其他实验代码

    其他代码
    source("http://bioconductor.org/biocLite.R")
    #biocLite()
    biocLite("affy")
    biocLite("hgu133plus2cdf")
    setwd("C:/Users/tvtZz/Downloads/GSE36272_RAW")
    library(affy)
    files =list.files("C:/Users/tvtZz/Downloads/GSE36272_RAW", full.names = TRUE)
    affy.data = ReadAffy(filenames = files)
    eset.mas5 = mas5(affy.data)
    exprSet.nologs = exprs(eset.mas5)
    #exprSet.nologs = preprocessCore::normalize.quntiles(eset.mas5)
    colnames(exprSet.nologs)
    colnames(exprSet.nologs)=c("GSM885697_25-Nip-Pxo99-1.CEL","GSM885698_26-Nip-Pxo99-2.CEL","GSM885699_27-Nip-Pxo99-3.CEL","GSM885700_31-Nip-ME7-1.CEL","GSM885701_32-Nip-ME7-2.CEL","GSM885702_33-Nip-ME7-3.CEL","GSM885703_28-Nip-ME5-1.CEL","GSM885704_29-Nip-ME5-2.CEL","GSM885705_30-Nip-ME5-3.CEL","GSM885706_43-Nip-H2O-1.CEL","GSM885707_44-Nip-H2O-2.CEL","GSM885708_45-Nip-H2O-3.CEL","GSM885709_46-Nip-NI-1.CEL","GSM885710_48-Nip-NI-3.CEL","GSM885711_37-Nip-T7174-1.CEL""GSM885712_38-Nip-T7174-2.CEL","GSM885713_39-Nip-T7174-3.CEL","GSM885714_34-Nip-PXO86-1.CEL","GSM885715_35-Nip-PXO86-2.CEL","GSM885716_36-Nip-PXO86-3.CEL","GSM885717_40-Nip-BLS303-1.CEL","GSM885718_41-Nip-BLS303-2.CEL","GSM885719_42-Nip-BLS303-3.CEL","GSM885720_Nip-ME1-1_7.CEL","GSM885721_Nip-ME1-2_8.CEL","GSM885722_Nip-ME1-3_9.CEL","GSM885723_Nip-ME2-1_4.CEL","GSM885724_Nip-ME2-2_5.CEL","GSM885725_Nip-ME2-3_6.CEL","GSM885726_Nip-99-1b_1.CEL","GSM885727_Nip-99-2b_2.CEL","GSM885728_Nip-99-3b_3.CEL")
    exprSet = log(exprSet.nologs, 2)
    data.mas5calls = mas5calls(affy.data)
    data.mas5calls.calls = exprs(data.mas5calls)
    resmean=apply(exprSet[,c("GSM885697_25-Nip-Pxo99-1.CEL","GSM885698_26-Nip-Pxo99-2.CEL","GSM885699_27-Nip-Pxo99-3.CEL","GSM885700_31-Nip-ME7-1.CEL","GSM885701_32-Nip-ME7-2.CEL","GSM885702_33-Nip-ME7-3.CEL","GSM885703_28-Nip-ME5-1.CEL","GSM885704_29-Nip-ME5-2.CEL","GSM885705_30-Nip-ME5-3.CEL","GSM885706_43-Nip-H2O-1.CEL","GSM885707_44-Nip-H2O-2.CEL","GSM885708_45-Nip-H2O-3.CEL","GSM885709_46-Nip-NI-1.CEL","GSM885710_48-Nip-NI-3.CEL","GSM885711_37-Nip-T7174-1.CEL","GSM885712_38-Nip-T7174-2.CEL","GSM885713_39-Nip-T7174-3.CEL","GSM885714_34-Nip-PXO86-1.CEL","GSM885715_35-Nip-PXO86-2.CEL","GSM885716_36-Nip-PXO86-3.CEL","GSM885717_40-Nip-BLS303-1.CEL","GSM885718_41-Nip-BLS303-2.CEL","GSM885719_42-Nip-BLS303-3.CEL","GSM885720_Nip-ME1-1_7.CEL","GSM885721_Nip-ME1-2_8.CEL","GSM885722_Nip-ME1-3_9.CEL","GSM885723_Nip-ME2-1_4.CEL","GSM885724_Nip-ME2-2_5.CEL","GSM885725_Nip-ME2-3_6.CEL","GSM885726_Nip-99-1b_1.CEL","GSM885727_Nip-99-2b_2.CEL","GSM885728_Nip-99-3b_3.CEL")],1,mean)
    negmean=apply(exprSet[,c("GSM885729_1-IR24-99-1.CEL","GSM885730_2-IR24-99-2.CEL","GSM885731_3-IR24-99-3.CEL","GSM885732_7-IR24-ME7-1.CEL","GSM885733_8-IR24-ME7-2.CEL","GSM885734_9-IR-24-ME7-3.CEL","GSM885735_4-IR24-ME5-1.CEL","GSM885739_20-IR-24-H20-2.CEL","GSM885740_21-IR-24-H20-3.CEL","GSM885741_22-IR-24-NI-1.CEL","GSM885742_23-IR-24-NI-2.CEL","GSM885743_24-IR-24-NI-3.CEL","GSM885744_13-IR-24-T-7174-1.CEL","GSM885745_14-IR-24-T-7174-2.CEL","GSM885746_15-IR-24-T-7174-3.CEL","GSM885747_10-IR-24-PXO-86-1.CEL","GSM885748_11-IR-24-PXO-86-2.CEL","GSM885749_12-IR-24-PXO-86-3.CEL","GSM885750_16-IR-24-BLS30J-1.CEL","GSM885751_17-IR-24-BLS3O3-2.CEL","GSM885752_18-IR-24-BLS3O3-3.CEL","GSM885753_IR24-ME1-1_16.CEL","GSM885754_IR24-ME1-2_17.CEL","GSM885755_IR24-ME1-3_18.CEL","GSM885756_IR24-ME2-1_15.CEL","GSM885757_IR24-ME2-2_13.CEL","GSM885758_IR24-ME2-3_14.CEL","GSM885759_IR24-99-1b_10.CEL","GSM885760_IR24-99-2b_11.CEL","GSM885761_IR24-99-3b_12.CEL" )],1,mean)
    diff=resmean-negmean
    pvalue=apply(exprSet, 1, function(x) {
        t.test(x[1:32], x[33:65])$p.value
    })
    pvalue=p.adjust(pvalue,"fdr")
    AP = apply(data.mas5calls.calls, 1, paste, collapse = "")
    #biocLite("stringr")
    library(stringr)
    genes.present = names(AP[str_count(AP, pattern = "A")<20])
    exprSet.present = exprSet[genes.present, ]
    -
    exprSet.significant = exprSet[intersect(genes.present,names(pvalue[pvalue<0.01])),]
    write.csv(exprSet.significant,"exprSet.significant.csv",sep=",")
    heatmap(exprSet.significant, main = "transcriptional responses of two susceptible rice cultivars to strains of Xanthomonas oryzae")
    ɾ³ýaffy»­Í¼
    x<-read.csv(file = "exprSet.significant_2.csv",header=T,row.names = 1)
    exprSet.significant<-as.matrix(x)
    heatmap(exprSet.significant, main = "transcriptional responses of two susceptible rice cultivars to strains of Xanthomonas oryzae")
    
    setwd("/home/student3/wgcna-zzy/")
     getwd()
    library(WGCNA);
    enableWGCNAThreads(2)
    #memory.limit(6000)
    options(stringsAsFactors = FALSE);#²»ÒªÈûùÒòºÅ×÷ΪÒòËØ
    #datTraits=read.csv("exprSet.significant.csv")
    dat0=read.csv("exprSet.significant.csv",header=TRUE)    #sep='\t',use read.table for txt file,colClasses="numeric" when data is integer,storage.mode(datExpr) = "double" for REAL() can only be applied to a 'numeric' error suggests none-num in the last column or some whereelse
    datSummary=dat0[,1:1];dim(dat0)
    
    datExpr = t(dat0[,2: ncol(dat0)])
    no.samples = dim(datExpr)[[1]]
    dim(datExpr)
    ArrayName= names(data.frame(dat0[,-1]))
    GeneName= dat0$GeneID
    datExpr=data.frame(t(dat0[,-1]))
    names(datExpr)=dat0[,1]
    #dimnames(datExpr)[[1]]=names(data.frame(dat0[,-1]))
    #table(dimnames(datExpr)[[1]]==datTraits$sample)
    #y=datTraits$gender
    #z=datTraits$age
    #sizeGrWindow(9, 5)
    #pdf("ClusterTreeSamples.pdf")
    #plotClusterTreeSamples(datExpr=datExpr, y=y)
    #dev.off()
    #rm(dat0);
    #gc()
    
    memory.size=100000
    powers=c(seq(1,10,by=1),seq(12,14,by=2))#¸øpowers¼¸¸öÖµÊÔ
    sft=pickSoftThreshold(datExpr, powerVector=powers,networkType = "signed") ÕÒ×îºÃµÄprower ,¾ØÕóÖÐR.sqÖУ¬ÖµÔ½¸ßÔ½ºÃ£¨0.8-0.9£©£¬µ«Ò²Òª×¢ÒâpowerÖµ²»Òª¹ý¸ß
    
    RpowerTable=sft[[2]]
    sizeGrWindow(9, 5);pdf('choosing power.pdf');par(mfrow = c(1,2));cex1 = 0.9;
    plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
    xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",main = paste("Scale independence"));
    text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],labels=powers,cex=cex1,col="red");
    abline(h=0.90,col="red");dev.off()
    # Mean connectivity as a function of the soft-thresholding power
    sizeGrWindow(9, 5);pdf('mean connectivity.pdf');plot(sft$fitIndices[,1], sft$fitIndices[,5],xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",main = paste("Mean connectivity"))
    text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red");dev.off()
    
    softPower =14
    Connectivity=softConnectivity(datExpr,corFnc = "cor", corOptions = "use ='p'",power=softPower)  #Ƥ¶ûÉ­Ïà¹ØϵÊý cor
    pdf("scale-free.pdf");scaleFreePlot(Connectivity,nBreaks = 10,truncated = FALSE,removeFirst = FALSE, main = "");dev.off()
    adjacency = adjacency(datExpr,corFnc = "cor", corOptions = "use ='p'",type = "signed", power = softPower)
    TOM = TOMsimilarity(adjacency,TOMType="signed");
    dissTOM = 1-TOM   #1-ÏàËƶȣ¬Ò»Ñù£¬Öظ´¾àÀëΪ0
    #method="complete"  ?
    geneTree = hclust(as.dist(dissTOM), method = "average")#¸ß°æ±¾ÒѾ­ÓÃhclust   h ²ã´Î¾ÛÀà
    minModuleSize =30;#Éè×îС¾ÛÀ࣬ʹ½á¹û¸ü¿ÉÐÅ
    dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,deepSplit = 0, pamRespectsDendro = FALSE,minClusterSize = minModuleSize,cutHeight=0.99);#¶¯Ì¬ÇÐ¸î £¬deepsplitÈ¡Öµ0-4 Ô½¸ßÔ½Ãô¸Ð£¬ÏàËƶȸ߿ÉÉè¸ßÒ»µã
    table(dynamicMods) #table½øÐÐ×Ü½á  ÓÐ0 µÄÊÇ»ùÒòÊÇ
    dynamicColors = labels2colors(dynamicMods) #°Ñ1 2 3±êÇ©¸Ä³ÉÑÕÉ«
    table(dynamicColors)
    
    #the following codes, dynamicColors was changed to dynamicMods for num labeling than Color
    MEList = moduleEigengenes(datExpr, colors = dynamicMods) #ÒÔÄ£¿éΪµ¥Î»¼ÆËã»ùÒò±í´ï PCA½µÎ¬
    MEs = MEList$eigengenes
    MEDiss = 1-cor(MEs); #ËãÄ£¿éÓëÄ£¿éÖ®¼äµÄ¾àÀë
    METree = hclust(as.dist(MEDiss), method = "average");#
    sizeGrWindow(7, 6)
    plot(METree, main = "Clustering of module eigengenes",xlab = "", sub = "")
    MEDissThres = 0.2
    abline(h=MEDissThres, col = "red")
    merge = mergeCloseModules(datExpr, dynamicMods, cutHeight = MEDissThres, verbose = 3);#ºÏ²¢½üµÄÄ£¿é
    mergedColors = merge$colors;#¸üÐÂÄ£¿éÑÕÉ«
    mergedMEs = merge$newMEs;
    sizeGrWindow(12, 9)
    pdf("DendroAndColors.pdf")
    plotDendroAndColors(geneTree, cbind(dynamicMods, mergedColors),c("Dynamic Tree Cut", "Merged dynamic"),dendroLabels = FALSE, hang = 0.03,addGuide = TRUE, guideHang = 0.05)
    dev.off()
    
    moduleColors = mergedColors
    colorOrder = c("grey", standardColors(unique(moduleColors)));
    moduleLabels = match(moduleColors, colorOrder)-1;
    MEs = mergedMEs;
    MEDiss = 1-cor(MEs);
    METree = hclust(as.dist(MEDiss), method = "average");#
    pdf("METree.pdf")
    plot(METree, main = "Clustering of module eigengenes",xlab = "", sub = "")
    dev.off()
    
    
    MEList = moduleEigengenes(datExpr, colors = dynamicMods)
    nSamples=nrow(datExpr)
    geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p"));
    MMPvalue = cbind.data.frame(datSummary,corPvalueStudent(as.matrix(geneModuleMembership), nSamples));
    write.table(data.frame(ArrayName,MEs),"MEs.csv",row.name=F) 
    kMEdat=data.frame(geneModuleMembership,MMPvalue)
    write.table(data.frame(datSummary,kMEdat),"kME-MMPvalue.csv",row.names=FALSE)
    datout=data.frame(datSummary, colorNEW=moduleColors, ConnectivityNew=Connectivity)
    write.table(datout, file="OutputCancerNetwork.csv", sep=",", row.names=F)
    k.in=intramodularConnectivity(adjacency(datExpr,,corFnc = "cor", corOptions = "use ='p'", type = "signed", power = softPower), moduleColors,scaleByMax = FALSE)
    write.table(data.frame(datSummary,k.in),"intraK.csv",sep=',',row.names=FALSE)
    hubs    = chooseTopHubInEachModule(datExpr, moduleColors)
    write.csv(data.frame(unique(moduleColors)[!unique(moduleColors) %in% 0)],labels2colors(unique(moduleColors)[!(unique(moduleColors) %in% 0)]),hubs[unique(moduleColors)]),"num2color.csv",row.names=F)
    
    #×Ô¼ºµÄ
    
     data.frame(unique(moduleColors)[!unique(moduleColors) %in% 0],labels2colors(unique(moduleColors)[!unique(moduleColors) %in% 0]),hubs[!is.na(hubs[unique(moduleColors)])])
     
     write.table(data.frame(unique(moduleColors)[!unique(moduleColors) %in% 0],labels2colors(unique(moduleColors)[!unique(moduleColors) %in% 0])),"num2color.csv",row.names=F)
    
    #»­TOMÈÈͼ£º¿É²»×ö
    pdf("TOM.pdf")
    TOMplot(dissTOM , geneTree, moduleColors)
    dev.off()
    #»­MDSͼ£º¿É²»×ö
    pdf("MDS.pdf")
    cmd1=cmdscale(as.dist(dissTOM),2)
    par(mfrow=c(1,1))#par ²ÎÊýµÄËõд£¬ÉèÖû­Í¼²ÎÊý   °Ñ¶àάµÄÊý¾ÝÓ³Éäµ½¶þά£¬TSNA
    plot(cmd1, col=as.character(moduleColors),  main="MDS plot",xlab="Scaling Dimension 1",ylab="Scaling Dimension 2")
    dev.off()
    #±í´ïÈÈͼ,restConnectivityCutÓÃgene¸öÊý´úÌ棺¿É²»×ö
    ConnectivityCut = 5000 #ãÐÖµÉèÖµ Ëæ±ã¸øÒ»¸ö²ÎÊý
    ConnectivityRank = rank(-Connectivity)  #¸øÁÐÏòÁ¿ÅÅÐò -ºÅÉýÐò
    restConnectivity = ConnectivityRank <= ConnectivityCut #Ñ¡Ç°500¸ö×ö±í´ïÈÈͼ
    par(mfrow=c(2,1), mar=c(1, 2, 4, 1))
    ClusterSamples=hclust(dist(datExpr[,restConnectivity] ),method="average") #
    # for the first (turquoise) module we use
    which.module="turquoise"
    pdf("Heat.pdf")
    plotMat(t(scale(datExpr[ClusterSamples$order,restConnectivity][,moduleColors==which.module ]) ),nrgcols=30,rlabels=T, clabels=T,rcols=which.module,main=which.module )
    dev.off()
    # for the second (blue) module we use
    which.module="brown"
    plot.mat(t(scale(datExpr[ClusterSamples$order,restConnectivity][,moduleColors==which.module ]) ),nrgcols=30,rlabels=T, clabels=T,rcols=which.module,
    main=which.module )
    
    
    
    #displaying module heatmap and the eigengene
    pdf("modheatmap.pdf")
    sizeGrWindow(8,7);
    which.module="green"
    ME=MEs[, paste("ME",which.module, sep="")]
    par(mfrow=c(2,1), mar=c(0.3, 5.5, 3, 2))
    plotmat(t(scale(datExpr[,moduleColors==which.module ])),nrgcols=30,rlabels=F,rcols=which.module,main=which.module, cex.main=2)
    dev.off()
    par(mar=c(5, 4.2, 0, 0.7))
    pdf("eigenebar.pdf")
    barplot(ME, col=which.module, main="", cex.main=2,ylab="eigengene expression",xlab="array sample")
    dev.off()
    
    #Correlate the module eigengenes with the trait
    signif(cor(y,MEs, use="p"),2)
    p.values = corPvalueStudent(cor(y,MEs, use="p"), nSamples = length(y))
    #Measure of module signicance as average gene signicance
    GS1=as.numeric(cor(y,datExpr, use="p"))
    GeneSignificance=abs(GS1)
    # Next module significance is defined as average gene significance.
    ModuleSignificance=tapply(GeneSignificance, moduleColors, mean, na.rm=T)
    sizeGrWindow(8,7)
    par(mfrow = c(1,1))
    plotModuleSignificance(GeneSignificance,moduleColors)
    
    #plot Gene signicance (y-axis) vs. intramodular connectivity (x-axis) »­Í¼gene sigºÍÁ¬½Ó¶ÈÉ¢µã
    colorlevels=unique(moduleColors)
    sizeGrWindow(9,6)
    par(mfrow=c(2,as.integer(0.5+length(colorlevels)/2)))
    par(mar = c(4,5,3,1))
    for (i in c(1:length(colorlevels)))
    {
    whichmodule=colorlevels[[i]];
    restrict1 = (moduleColors==whichmodule);
    verboseScatterplot(k.in$kWithin[restrict1],
    GeneSignificance[restrict1], col=moduleColors[restrict1],
    main=whichmodule,
    xlab = "Connectivity", ylab = "Gene Significance", abline = TRUE)
    }
    
    #Generalizing intramodular connectivity for all genes on the array
    datKME=signedKME(datExpr, MEs, outputColumnName="MM.")
    # Display the first few rows of the data frame
    head(datKME)
    
    #Output of the results of network screening analysis
    NS1=networkScreening(y=y, datME=MEs, datExpr=datExpr,oddPower=3, blockSize=10000, minimumSampleSize=4,addMEy=TRUE, removeDiag=FALSE, weightESy=0.5)
    GeneResultsNetworkScreening=data.frame(GeneName=row.names(NS1), NS1)
    #write.table(GeneResultsNetworkScreening, file="GeneResultsNetworkScreening.csv",row.names=F,sep=",")
    MEsy = data.frame(y, MEs)
    eigengeneSignificance = cor(MEsy, y);
    eigengeneSignificance[1,1] = (1+max(eigengeneSignificance[-1, 1]))/2
    eigengeneSignificance.pvalue = corPvalueStudent(eigengeneSignificance, nSamples = length(y))
    namesME=names(MEsy)
    # Form a summary data frame
    out1=data.frame(t(data.frame(eigengeneSignificance,eigengeneSignificance.pvalue, namesME, t(MEsy))))
    # Set appropriate row names
    dimnames(out1)[[1]][1]="EigengeneSignificance"
    dimnames(out1)[[1]][2]="EigengeneSignificancePvalue"
    dimnames(out1)[[1]][3]="ModuleEigengeneName"
    dimnames(out1)[[1]][-c(1:3)]=dimnames(datExpr)[[1]]
    # Write the data frame into a file
    write.table(out1, file="MEResultsNetworkScreening.csv", row.names=TRUE, col.names = TRUE, sep=",")
    GeneName=dimnames(datExpr)[[2]]
    GeneSummary=data.frame(GeneName, moduleColors, NS1)
    write.table(GeneSummary, file="GeneSummaryTutorial.csv", row.names=F,sep=",")
    datTraits=data.frame(ArrayName, MEsy)
    dimnames(datTraits)[[2]][2:length(namesME)]=paste("Trait",dimnames(datTraits)[[2]][2:length(namesME)],sep=".")
    write.table(datTraits, file="TraitsTutorial.csv", row.names=F,sep=",")
    
    #Relationships among the top 30 most signicant genes,correlation heatmaps for signed network:
    sizeGrWindow(7,7)
    NS1=networkScreening(y=y, datME=MEs, datExpr=datExpr,oddPower=3, blockSize=10000, minimumSampleSize=4,addMEy=TRUE, removeDiag=FALSE, weightESy=0.5)
    topList=rank(NS1$p.Weighted,ties.method="first")<=30
    gene.names= names(datExpr)[topList]
    # The following shows the correlations between the top genes
    plotNetworkHeatmap(datExpr, plotGenes = gene.names,networkType="signed", useTOM=FALSE,power=1, main="signed correlations")
    
    #Exporting to Cytoscape£»¿É²Î¿¼cytoscapeCode.txt    ÌáÈ¡Ä£¿éÐÅÏ¢
    # Recalculate topological overlap if needed
    #TOM = TOMsimilarityFromExpr(datExpr, power = 6,TOMType="signed");
    # Read in the annotation file
    #annot = read.csv(file = "HGU1332-annot.csv");
    # Select modules
    modules = c("black","blue","brown","cyan","green","greenyellow","magenta","midnightblue","pink","purple","red","salmon","turquoise","yellow");
    # Select module probes
    probes = dat0[,1]
    inModule = is.finite(match(moduleColors, modules));
    modProbes = probes[inModule];
    #modGenes = annot$To[match(modProbes, annot$From)];
    # Select the corresponding Topological Overlap
    modTOM = TOM[inModule, inModule];
    dimnames(modTOM) = list(modProbes, modProbes)
    # Export the network into edge and node list files Cytoscape can read
    cyt = exportNetworkToCytoscape(modTOM,
    edgeFile = paste("CytoscapeInput-edges-", paste(modules, collapse="-"), ".txt", sep=""),
    nodeFile = paste("CytoscapeInput-nodes-", paste(modules, collapse="-"), ".txt", sep=""),
    weighted = TRUE,threshold = 0.02,nodeNames = modProbes ,nodeAttr = moduleColors[inModule]);
    #automatic finish the Cytoscape mods
    n=length(unique(moduleColors))  #Ä£¿éÊý
    pb <- txtProgressBar(min = 0, max = n, style = 3) #½ø¶ÈÌõ
    for (p in 1:n){ modules=unique(moduleColors)[p]
    inModule = is.finite(match(moduleColors,modules));
    modProbes = probes[inModule]; #¼ÆËãÏà¹Ø¶È Ìáȡ̽Õë
    modTOM = TOM[inModule, inModule];#̽Õë¶ÔÓ¦TOMÖµ
    dimnames(modTOM) = list(modProbes, modProbes)
    cyt = exportNetworkToCytoscape(modTOM,
    edgeFile = paste("CytoscapeInput-edges-", paste(modules, collapse="-"), ".txt", sep=""),
    #nodeFile = paste("CytoscapeInput-nodes-", paste(modules, collapse="-"), ".txt", sep=""),
    weighted = TRUE,threshold = 0.02,nodeNames = modProbes ,nodeAttr = moduleColors[inModule]);  #threshold can be replaced by quantile(abs(modTOM),probs=0.8)
    setTxtProgressBar(pb, p)}
    close(pb)
    
    #¶à¸ötraitºÍmoduleÁªÏµÆðÀ´
    allTraits = read.csv("Trait.csv");#¶ÁÈ¡ÐÔ×´µÄ¹Øϵ
    femaleSamples=rownames(datExpr);
    traitRows = match(femaleSamples, allTraits$arrayname);
    datTraits = allTraits[traitRows, -1];
    nGenes = ncol(datExpr);
    nSamples = nrow(datExpr);
    moduleTraitCor = cor(MEs, datTraits, use = "p");
    moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples);
    sizeGrWindow(10,6)
    # Will display correlations and their p-values
    textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
    signif(moduleTraitPvalue, 1), ")", sep = "");
    dim(textMatrix) = dim(moduleTraitCor)
    par(mar = c(6, 8.5, 3, 3));
    # Display the correlation values within a heatmap plot
    labeledHeatmap(Matrix = moduleTraitCor,
    xLabels = names(datTraits),
    yLabels = names(MEs),
    ySymbols = names(MEs),
    colorLabels = FALSE,
    colors = greenWhiteRed(50),
    textMatrix = textMatrix,
    setStdMargins = FALSE,
    cex.text = 0.5,
    zlim = c(-1,1),
    main = paste("Module-trait relationships"))
    #¿ÉÒÔÊä³öÈÈͼÖеÄÊý¾Ý£¬textMatrix¿ÉÒÔɾȥ
    out2=data.frame(data.frame(moduleTraitCor,moduleTraitPvalue))
    write.table(out2, file="trait-module relationship.csv", row.names=TRUE, col.names = TRUE, sep=",")
    #corº¯Êý¿ÉÒÔÓÐuseºÍmethod²ÎÊýµÄÉèÖà : Missing values present in input data. Consider using use = 'pairwise.complete.obs'.
    ×¢Ò⣺´óСдÊDz»Í¬µÄ£¬ÌرðÊÇÎļþÃûµÈ£¡
    

    相关文章

      网友评论

        本文标题:WGCNA分析代码

        本文链接:https://www.haomeiwen.com/subject/zcookctx.html