WGCNA

作者: 生信编程日常 | 来源:发表于2020-02-25 23:27 被阅读0次

    加权基因共表达网络分析 (WGCNA, Weighted correlation network analysis)是用来描述不同样品之间基因关联模式的系统生物学方法,可以用来鉴定高度协同变化的基因集,
    测试数据下载地址:https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/WGCNA/Tutorials/FemaleLiver-Data.zip

    source("https://bioconductor.org/biocLite.R")
    biocLite(c("AnnotationDbi", "impute","GO.db", "preprocessCore"))
    install.packages(c("WGCNA", "stringr", "reshape2"), repos="https://mirrors.tuna.tsinghua.edu.cn/CRAN")
    library(WGCNA)
    options(stringsAsFactors = FALSE);
    #开多线程,但是Mac上似乎这一步会报错,可不做
    enableWGCNAThreads()
    library(WGCNA);
    options(stringsAsFactors = FALSE);
    

    Data input, cleaning and pre-processing

    #制作基因表达矩阵
    #Read in the female liver data set
    femData = read.csv("Documents/project/test/FemaleLiver-Data/LiverFemale3600.csv");
    datExpr0 = as.data.frame(t(femData[, -c(1:8)]));
    names(datExpr0) = femData$substanceBXH;
    
    
    ###如果基因多的话,可以筛选变化大的基因,减小计算量,也可不做这一步
    m.mad <- apply(dataExpr0,1,mad)
    dataExpr0 <- dataExpr0[which(m.mad > 
                     max(quantile(m.mad, probs=seq(0, 1, 0.25))[2],0.01)),]
    
    gsg = goodSamplesGenes(datExpr0, verbose = 3);
    gsg$allOK
    

    如果有需要去除的样本和基因的话,执行这一步,ok的话跳过这一步

    if (!gsg$allOK)
    {
      # Optionally, print the gene and sample names that were removed:
      if (sum(!gsg$goodGenes)>0) 
         printFlush(paste("Removing genes:", paste(names(datExpr0)[!gsg$goodGenes], collapse = ", ")));
      if (sum(!gsg$goodSamples)>0) 
         printFlush(paste("Removing samples:", paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", ")));
      # Remove the offending genes and samples from the data:
      datExpr0 = datExpr0[gsg$goodSamples, gsg$goodGenes]
    }
    

    将样本聚类检查是否有离群样本,检测outlier

    sampleTree = hclust(dist(datExpr0), method = "average");
    plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5, 
         cex.axis = 1.5, cex.main = 2)
    

    这样的样本有outlier,需要cut掉



    只留下需要的sample

    clust = cutreeStatic(sampleTree, cutHeight = 15, minSize = 10)
    table(clust)
    # clust 1 contains the samples we want to keep.
    keepSamples = (clust==1)
    datExpr = datExpr0[keepSamples, ]
    nGenes = ncol(datExpr)
    nSamples = nrow(datExpr)
    sampleTree = hclust(dist(datExpr), method = "average");
    plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5, 
         cex.axis = 1.5, cex.main = 2)
    
    image.png

    最后完成的基因表达矩阵


    image.png

    制作表达矩阵对应的特征矩阵

    traitData = read.csv("Documents/project/test/FemaleLiver-Data/ClinicalTraits.csv");
    dim(traitData)
    names(traitData)
    
    # remove columns that hold information we do not need.
    allTraits = traitData[, -c(31, 16)];
    allTraits = allTraits[, c(2, 11:36) ];
    # Form a data frame analogous to expression data that will hold the clinical traits.
    femaleSamples = rownames(datExpr);
    traitRows = match(femaleSamples, allTraits$Mice);
    datTraits = allTraits[traitRows, -1];
    rownames(datTraits) = allTraits[traitRows, 1];
    

    完成的特征矩阵


    image.png

    ############################################################

    2.Network construction and module detection

    2.1Automatic, one-step network construction and module detection

    #软阈值筛选##
    # Choose a set of soft-thresholding powers
    powers = c(c(1:10), seq(from = 12, to=20, by=2))
    # Call the network topology analysis function
    sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
    # Plot the results:
    sizeGrWindow(9, 5)
    par(mfrow = c(1,2));
    cex1 = 0.9;
    # Scale-free topology fit index as a function of the soft-thresholding power
    plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
         xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
        main = paste("Scale independence"));
    text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
        labels=powers,cex=cex1,col="red");
    # this line corresponds to using an R^2 cut-off of h
    abline(h=0.90,col="red")
    # Mean connectivity as a function of the soft-thresholding power
    plot(sft$fitIndices[,1], sft$fitIndices[,5],
        xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
        main = paste("Mean connectivity"))
    text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
    
    image.png

    参考:
    http://blog.genesino.com/2018/04/wgcna/#wgcna%E5%9F%BA%E6%9C%AC%E6%A6%82%E5%BF%B5

    https://www.jianshu.com/p/3618f5ff3eb0

    相关文章

      网友评论

        本文标题:WGCNA

        本文链接:https://www.haomeiwen.com/subject/qmsgqhtx.html