现在想要做一下dc细胞类型的WGCNA,参考教程是周运来小红书,大家可以去找一下,我的笔记如下:
# 37.WGCNA of DC and MA -------------------------------------------------
rm(list=ls())
setwd('/data1/nyy/HGPS_scRNAseq/seurat/integrate/cell_tyle/DC_WGCNA')
library(WGCNA)
library(Seurat)
library(tidyverse)
library(reshape2)
library(stringr)
library(dplyr)
load('/data1/nyy/HGPS_scRNAseq/seurat/integrate/Rdata/LS.sample.FindClusters.sub.cluster.group.sample.abb.RData')
names(sample.integrated@meta.data)
cell.type <- sample.integrated@meta.data[,c(1,20,24)]
table(cell.type$cell.type)
dc <- rownames(cell.type[cell.type$cell.type=='Myeloid dendritic cells',])
data <- as.data.frame(sample.integrated@assays$RNA@data)
dc_data <- data[,which(names(data)%in%dc)]#18060
head(dc_data[1:4,1:4])
dim(dc_data)#18060 280 这就是表达谱
# 进一步筛选细胞和基因
m.mad <- apply(dc_data,1,mad)
head(m.mad)
#dataExprVar <- dataExpr[which(m.mad >
# max(quantile(m.mad, probs=seq(0, 1, 0.25))[2],0.01)),]
dataExprVar <- dc_data[which(m.mad > 0),]#2039
## 转换为样品在行,基因在列的矩阵
dataExpr <- as.data.frame(t(dataExprVar))
dim(dataExpr)#280 2039
head(dataExpr)[,1:8]
## 检测缺失值
gsg = goodSamplesGenes(dataExpr, verbose = 3)
gsg$allOK
gsg$goodSamples
if (!gsg$allOK){
# Optionally, print the gene and sample names that were removed:
if (sum(!gsg$goodGenes)>0)
printFlush(paste("Removing genes:",
paste(names(dataExpr)[!gsg$goodGenes], collapse = ",")));
if (sum(!gsg$goodSamples)>0)
printFlush(paste("Removing samples:",
paste(rownames(dataExpr)[!gsg$goodSamples], collapse = ",")));
# Remove the offending genes and samples from the data:
dataExpr = dataExpr[gsg$goodSamples, gsg$goodGenes]
}
nGenes = ncol(dataExpr)
nSamples = nrow(dataExpr)
dim(dataExpr)#280 2039
## 查看是否有离群样品
sampleTree = hclust(dist(dataExpr), method = "average")
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="")#这要是几千个细胞的话,简直一团黑
真的是一团黑
powers = c(c(1:10), seq(from = 12, to=30, by=2))
sft = pickSoftThreshold(dataExpr, powerVector=powers,
networkType="signed", verbose=5)
par(mfrow = c(1,2))
cex1 = 0.9
# 横轴是Soft threshold (power),纵轴是无标度网络的评估参数,数值越高,
# 网络越符合无标度特征 (non-scale)
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",
ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red")
# 筛选标准。R-square=0.85
abline(h=0.85,col="red")
# Soft threshold与平均连通性
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers,
cex=cex1, col="red")
power = sft$powerEstimate
softPower = power
softPower#9
#参数beta取值默认是1到30,上述图形的横轴均代表权重参数β,左图纵轴代表对应的网络中log(k)与log(p(k))相关系数的平方。相关系数的平方越高,说明该网络越逼近无网路尺度的分布。右图的纵轴代表对应的基因模块中所有基因邻接函数的均值。最佳的beta值就是sft$powerEstimate,已经被保存到变量了,不需要知道具体是什么,后面的代码都用这个即可,在本例子里面是9。
最后画出来的图
#即使你不理解它,也可以使用代码拿到合适“软阀值(soft thresholding power)”beta进行后续分析。
# 无向网络在power小于15或有向网络power小于30内,没有一个power值可以使
# 无标度网络图谱结构R^2达到0.8,平均连接度较高如在100以上,可能是由于
# 部分样品与其他样品差别太大。这可能由批次效应、样品异质性或实验条件对
# 表达影响太大等造成。可以通过绘制样品聚类查看分组信息和有无异常样品。
# 如果这确实是由有意义的生物变化引起的,也可以使用下面的经验power值。
if (is.na(power)){
power = ifelse(nSamples<20, ifelse(type == "unsigned", 9, 18),
ifelse(nSamples<30, ifelse(type == "unsigned", 8, 16),
ifelse(nSamples<40, ifelse(type == "unsigned", 7, 14),
ifelse(type == "unsigned", 6, 12))
)
)
}
power
#一步法网络构建:One-step network construction and module detection##
# power: 上一步计算的软阈值
# maxBlockSize: 计算机能处理的最大模块的基因数量 (默认5000);
# 4G内存电脑可处理8000-10000个,16G内存电脑可以处理2万个,32G内存电脑可
# 以处理3万个
# 计算资源允许的情况下最好放在一个block里面。
# corType: pearson or bicor
# numericLabels: 返回数字而不是颜色作为模块的名字,后面可以再转换为颜色
# saveTOMs:最耗费时间的计算,存储起来,供后续使用
# mergeCutHeight: 合并模块的阈值,越大模块越少
# type = unsigned
cor <- WGCNA::cor
net = blockwiseModules(dataExpr, power = power, maxBlockSize = nGenes,#nGenes
+ TOMType = "unsigned", minModuleSize = 10,
+ reassignThreshold = 0, mergeCutHeight = 0.25,
+ numericLabels = TRUE, pamRespectsDendro = FALSE,
+ saveTOMs=TRUE, corType = 'pearson',
+ loadTOMs=TRUE,
+ saveTOMFileBase = paste0("dataExpr", ".tom"),
+ verbose = 3)
Calculating module eigengenes block-wise from all genes
Flagging genes and samples with too many missing values...
..step 1
..Working on block 1 .
TOM calculation: adjacency..
..will not use multithreading.
Fraction of slow calculations: 0.000000
..connectivity..
..matrix multiplication (system BLAS)..
..normalization..
..done.
..saving TOM for block 1 into file dataExpr.tom-block.1.RData
....clustering..
....detecting modules..
....calculating module eigengenes..
....checking kME in modules..
..merging modules that are too close..
mergeCloseModules: Merging modules whose distance is less than 0.25
Calculating new MEs...
table(net$colors)
net$unmergedColors
head(net$MEs)
net$goodSamples
net$goodGenes
net$TOMFiles
net$blockGenes
net$blocks
net$MEsOK
head(net$MEs)
table(net$colors)
#这里用不同的颜色来代表那些所有的模块,其中灰色默认是无法归类于任何模块的那些基因,如果灰色模块里面的基因太多,那么前期对表达矩阵挑选基因的步骤可能就不太合适。
## 灰色的为**未分类**到模块的基因。
# Convert labels to colors for plotting
moduleLabels = net$colors
moduleColors = labels2colors(moduleLabels)
moduleColors
# Plot the dendrogram and the module colors underneath
# 如果对结果不满意,还可以recutBlockwiseTrees,节省计算时间
plotDendroAndColors(net$dendrograms[[1]], moduleColors[net$blockGenes[[1]]],
"Module colors",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
灰色的也太多了……
让我做下去的原因是还有不是灰色的,哈哈哈哈,全程代码都很好用,直接点点点就行,不会报错
#这里用不同的颜色来代表那些所有的模块,其中灰色默认是无法归类于任何模块的那些基因,如果灰色模块里面的基因太多,那么前期对表达矩阵挑选基因的步骤可能就不太合适。
## 灰色的为**未分类**到模块的基因。
# Convert labels to colors for plotting
moduleLabels = net$colors
moduleColors = labels2colors(moduleLabels)
moduleColors
# Plot the dendrogram and the module colors underneath
# 如果对结果不满意,还可以recutBlockwiseTrees,节省计算时间
plotDendroAndColors(net$dendrograms[[1]], moduleColors[net$blockGenes[[1]]],
"Module colors",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
# module eigengene, 可以绘制线图,作为每个模块的基因表达趋势的展示
MEs = net$MEs
### 不需要重新计算,改下列名字就好
### 官方教程是重新计算的,起始可以不用这么麻烦
MEs_col = MEs
colnames(MEs_col) = paste0("ME", labels2colors(
as.numeric(str_replace_all(colnames(MEs),"ME",""))))
MEs_col = orderMEs(MEs_col)
# 根据基因间表达量进行聚类所得到的各模块间的相关性图
# marDendro/marHeatmap 设置下、左、上、右的边距
head(MEs_col)
?plotEigengeneNetworks
plotEigengeneNetworks(MEs, "Eigengene adjacency heatmap",
marDendro = c(3,3,2,4),
marHeatmap = c(3,4,2,2),
plotDendrograms = T,
xLabelsAngle = 90)
image.png
网友评论