美文网首页
跟着Nature Plants学数据分析:R语言WGCNA分析完

跟着Nature Plants学数据分析:R语言WGCNA分析完

作者: 小明的数据分析笔记本 | 来源:发表于2022-08-16 00:21 被阅读0次

    论文

    Plant flavones enrich rhizosphere Oxalobacteraceae to improve maize performance under nitrogen deprivation

    https://www.nature.com/articles/s41477-021-00897-y#data-availability

    https://github.com/PengYuMaize/Yu2021NaturePlants

    https://www.youtube.com/watch?v=dwWhm78j8YU 作者还录了视频介绍

    论文中提供的数据有100多个样本,这里我减少数据量,只用19个样本

    表达量的count文件

    image.png

    最后一列是基因长度,用来计算fpkm

    样本的分组文件

    image.png

    样本的表型

    image.png

    样本的表型论文中提供的数据也有很多,这里我只选取其中的一部分

    代码

    代码里有很多细节我也不明白,但是能够从前到后运行完,这里记录一下,细节有空再来研究吧

    加载需要的R包

    library(WGCNA)
    enableWGCNAThreads(nThreads = 6)
    #BiocManager::install("DESeq2")
    library(DESeq2)
    

    这里用到deseq2主要是用来计算fpkm的

    读取三个数据

    data0<-read.table("my_counts.csv",
                      header=T,
                      row.names=1,
                      sep=",")
    head(data0)
    sample_metadata <- read.csv(file = "sample_info.csv",
                                header = TRUE)
    head(sample_metadata)
    
    bac_traits<-read.csv("input_data/traits.csv",row.names = 1)
    head(bac_traits)
    

    利用count值计算fpkm并计算log2然后转置

    dataExpr_deseq <- DESeqDataSetFromMatrix(countData = data0[,-20],
                                             colData = sample_metadata,
                                             design = ~ Zone)
    mcols(dataExpr_deseq)$basepairs = data0$geneLength
    fpkm_matrix = fpkm(dataExpr_deseq)
    datExpr = t(log2(fpkm_matrix+1))
    
    dim(datExpr)
    

    对数据进行过滤

    nSamples = nrow(datExpr)  # 统计样品数目
    variancedatExpr=as.vector(apply(as.matrix(datExpr),2,var, na.rm=T))  #按列(基因)取方差
    no.missingdatExpr=as.vector(apply(is.na(as.matrix(datExpr)),2, sum) )#按列(基因)统计缺失数目
    KeepGenes= variancedatExpr>0 & no.missingdatExpr<0.1*nSamples  # 保留方差不等于0,且缺失低于10%的基因
    table(KeepGenes)# 过滤统计
    
    datExpr=datExpr[, KeepGenes]
    

    这一步原文的代码里没有,可能是我减少了样本的原因,如果没有这一步后面后有报错

    第一步是样本聚类树

    sampleTree = hclust(dist(datExpr), method = "average")
    pdf(file = "1-n-sampleClustering.pdf", width = 12, height = 9);
    par(cex = 0.5);
    par(mar = c(0,4,2,0))
    plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="",
         cex.lab = 1.5,cex.axis = 1.5, cex.main = 2)
    dev.off()
    
    image.png

    第二步是筛选阈值

    powers = c(c(1:20), seq(from = 22, to=30, by=2))
    sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5) 
    # Scale-free topology fit index as a function of the soft-thresholding power
    pdf(file = "2-n-sft.pdf", width = 9, height = 5);
    par(mfrow = c(1,2));
    cex1 = 0.9;
    plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
         xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
         main = paste("Scale independence"));
    text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
         labels=powers,cex=cex1,col="red");
    # this line corresponds to using an R^2 cut-off of h
    abline(h=0.90,col="red") 
    # Mean connectivity as a function of the soft-thresholding power
    plot(sft$fitIndices[,1], sft$fitIndices[,5],
         xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
         main = paste("Mean connectivity")) 
    text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
    dev.off()
    
    image.png

    第三步是基因聚类树

    power=sft$powerEstimate
    power
    #TOM = TOMsimilarityFromExpr(datExpr, power = power) 这行代码要好长时间,需要比较大的内存
    ## long time big mem
    ## multiple cpu
    
    ## 这一步用linux系统的电脑设置多核心还挺快的 enableWGCNAThreads(nThreads = 4),
    ## 我设置12核心很快就得到了结果
    ## 但是我在自己的win系统上设置多核好像没有作用
    ## 我这边在linux系统运行完,把结果保存下来
    
    load("../TOM.rdata")
    dissTOM = 1-TOM 
    # Plot gene tree
    geneTree = hclust(as.dist(dissTOM), method = "average");
    pdf(file = "3-gene_cluster.pdf", width = 12, height = 9);
    plot(geneTree, 
         xlab="", 
         sub="", 
         main = "Gene clustering on TOM-based dissimilarity",
         labels = FALSE, 
         hang = 0.04);
    dev.off()
    
    image.png

    第四步是划分模块

    dynamicMods = cutreeDynamic(dendro = geneTree, 
                                distM = dissTOM,
                                deepSplit = 2, 
                                pamRespectsDendro = FALSE,
                                minClusterSize = 30)
    
    dynamicColors = labels2colors(dynamicMods)
    
    pdf(file = "4-module_tree.pdf", width = 8, height = 6);
    plotDendroAndColors(geneTree, 
                        dynamicColors, 
                        "Dynamic Tree Cut",
                        dendroLabels = FALSE,
                        hang = 0.03,
                        addGuide = TRUE, 
                        guideHang = 0.05,
                        main = "Gene dendrogram and module colors")
    dev.off()
    
    MEDissThres=0.25
    
    merge = mergeCloseModules(datExpr, 
                              dynamicColors, 
                              cutHeight = MEDissThres, 
                              verbose = 3) 
    mergedColors = merge$colors  
    mergedMEs = merge$newMEs  
    # Plot merged module tree
    pdf(file = "5-merged_Module_Tree.pdf", width = 12, height = 9)  
    plotDendroAndColors(geneTree, 
                        cbind(dynamicColors, mergedColors), 
                        c("Dynamic Tree Cut", "Merged dynamic"), 
                        dendroLabels = FALSE, 
                        hang = 0.03, 
                        addGuide = TRUE, 
                        guideHang = 0.05)  
    dev.off()
    write.table(merge$oldMEs,file="oldMEs.txt");
    write.table(merge$newMEs,file="newMEs.txt");
    
    image.png

    第五步是表型和模块的相关性

    nGenes = ncol(datExpr);
    nSamples = nrow(datExpr);
    # Recalculate MEs with color labels
    MEs0 = moduleEigengenes(datExpr, mergedColors)$eigengenes
    MEs = orderMEs(MEs0)
    
    table(rownames(MEs) == rownames(bac_traits))
    moduleTraitCor = cor(MEs, bac_traits, use = "p");
    moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples);
    write.table(moduleTraitCor,file="moduleTrait_correlation.txt");
    write.table(moduleTraitPvalue,file="moduleTrait_pValue.txt");
    
    textMatrix =  paste(signif(moduleTraitCor, 2), "\n(",
                        signif(moduleTraitPvalue, 1), ")", sep = "");
    dim(textMatrix) = dim(moduleTraitCor)
    pdf("module-traits-bacteria-order.pdf", width = 10, height = 10)
    par(mar = c(15, 12, 5, 5));
    # Display the correlation values within a heatmap plot
    labeledHeatmap(Matrix = moduleTraitCor,
                   xLabels = names(bac_traits),
                   yLabels = names(MEs),
                   ySymbols = names(MEs),
                   colorLabels = FALSE,
                   colors = greenWhiteRed(50),
                   textMatrix = textMatrix,
                   setStdMargins = FALSE,
                   cex.text = 0.5,
                   zlim = c(-1,1),
                   main = paste("Module-trait relationships"))
    dev.off()
    
    image.png image.png

    欢迎大家关注我的公众号

    小明的数据分析笔记本

    小明的数据分析笔记本 公众号 主要分享:1、R语言和python做数据分析和数据可视化的简单小例子;2、园艺植物相关转录组学、基因组学、群体遗传学文献阅读笔记;3、生物信息学入门学习资料及自己的学习笔记!

    相关文章

      网友评论

          本文标题:跟着Nature Plants学数据分析:R语言WGCNA分析完

          本文链接:https://www.haomeiwen.com/subject/czqrgrtx.html