背景:在umap聚类的基础上,进行WGCNA的分析。
- WGCNA
(1)两个部分:表达量聚类分析和表型关联
(2)四个步骤:基因之间相关系数计算、基因模块的确定、共表达网络、模块与性状关联 - 预处理:将原始细胞基因表达矩阵转换为伪细胞基因表达矩阵
所谓的伪细胞就是将原来10(size=10)个细胞当作是1个细胞来分析,当然size是可以改变的。
这里写一个过程:
library(Seurat)
library(dplyr)
sc<-readRDS("./sc_umap.rds")
sc<-sc
#' 构建原始细胞名字和伪细胞名字的对照表格cell.id_table
old_cell.id<-c()
new_cell.id<-c()
for(i in unique(sc$seurat_clusters)){
sub_meta.data<-sc@meta.data[sc@meta.data$seurat_clusters==i,]
sub_cell.id<-rownames(sub_meta.data)
int_num<-floor(length(sub_cell.id)/10)
remain_num<-length(sub_cell.id)%%10
for(j in 1:int_num){
sub_new<-rep(paste0(i,"_cell_",j),10)
new_cell.id<-c(new_cell.id,sub_new)
}
sub_new<-rep(paste0(i,"_cell_",j+1),remain_num)
new_cell.id<-c(new_cell.id,sub_new)
old_cell.id<-c(old_cell.id,sample(sub_cell.id)) #'这里sample函数相当重要,它打乱了原始的细胞顺序,增加随机性
}
cell.id_table<-data.frame(new.id=new_cell.id,old.id=old_cell.id)
head(cell.id_table)
cell.id_table.png
table(cell.id_table$new.id)
可以看见每个伪细胞里其实包含了原始的10个细胞:table.png
#' 提取原始seurat对象的标准化表达矩阵
t.counts<-data.table(t(as.matrix(slot(sc@assays[["RNA"]],"data"))),keep.rownames=T) #'这样提取避免行名是字符串
rownames(cell.id_table)<-cell.id_table$old.id
cell.id_table<-cell.id_table[t.counts$rn,]
t.counts$rn<-cell.id_table$new.id
t.counts<- t.counts[,lapply(.SD,mean), by=rn] #’根据rn那一列的值,合并基因表达量,mean代表使用平均表达量
gene.name <- t.counts$rn
cell.id <- rownames(t.counts)
my.counts <- t(t.counts[,rn:=NULL])
colnames(my.counts)<-cell.id)
new.sc<- CreateSeuratObject(my.counts)
saveRDS(new.sc,"pseudo_sc.rds")
my.counts.png
可以看到细胞名字已经变成了伪细胞的名字。
- WGCNA分析
library(WGCNA)
options(stringsAsFactors = FALSE)
enableWGCNAThreads() #' 打开多线程
datExpr0<-as.data.frame(t(my.counts)) #' 行为伪细胞,列为基因
#' 检查缺失值
gsg = goodSamplesGenes(datExpr0, verbose = 3)
gsg$allOK #' 这个值为TRUE就没问题,反之进行以下筛选
if (!gsg$allOK){
# Optionally, print the gene and sample names that were removed:
if (sum(!gsg$goodGenes)>0)
printFlush(paste("Removing genes:", paste(names(datExpr0)[!gsg$goodGenes], collapse = ", ")))
if (sum(!gsg$goodSamples)>0)
printFlush(paste("Removing samples:", paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", ")))
# Remove the offending genes and samples from the data:
datExpr0 = datExpr0[gsg$goodSamples, gsg$goodGenes]
}
这里也许过滤了一些细胞或者基因,接下来进一步过滤基因:
#' 过滤,根据阈值筛选基因列
meanFPKM= 0.5 #' 过滤标准,可以修改
n=nrow(datExpr0)
datExpr0[n+1,]=apply(datExpr0[c(1:nrow(datExpr0)),],2,mean) #' 计算每一列基因的平均值
datExpr0=datExpr0[1:n,datExpr0[n+1,] > meanFPKM] #' 只保留平均值大于阈值的列
# for meanFpkm in row n+1 and it must be above what you set--select meanFpkm>opt$meanFpkm(by rp)
filtered_fpkm=t(datExpr0) #' 转置,变为行为基因,列为细胞
filtered_fpkm=data.frame(rownames(filtered_fpkm),filtered_fpkm) #' 转换为数据框
names(filtered_fpkm)[1]="sample"
head(filtered_fpkm)
write.table(filtered_fpkm, file="mRNA.symbol.uniq.filter.txt",
row.names=F, col.names=T,quote=FALSE,sep="\t")
nGenes = ncol(datExpr0)
nSamples = nrow(datExpr0)
head(filtered_fpkm).png
接下来过滤一些细胞:
#' 聚类,对伪细胞进行聚类,筛选伪细胞,去除聚类关系上较远的伪细胞
sampleTree = hclust(dist(datExpr0), method = "average")
pdf(file = "sampleClustering_fig1.pdf", width = 15, height = 8)
par(cex = 0.6)
par(mar = c(0,6,6,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 2,
cex.axis = 1.5, cex.main = 2)
### Plot a line to show the cut
#abline(h = 180, col = "red")##剪切高度不确定,故无红线
dev.off()
sampleclustering.png
接下来是较为关键的一步,寻找合适的软阈值power:
#' 寻找软阈值power,power是对相关性的值进行幂次运算(所谓加权网络的边)的值
#' abs(cor(geneX, geneY)) ^ power
powers=c(c(1:10),seq(from = 12, to = 20, by = 2))
sft = pickSoftThreshold(datExpr0, powerVector = powers, verbose = 5)
#' 可视化power,拐点位置就是合适的值
pdf('powers_fig2.pdf',18,10)
par(mfrow = c(1,2))
cex1 = 0.9
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",
ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red")
abline(h=0.85,col="red")
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers,
cex=cex1, col="red")
dev.off()
powers_fig2.png
通常,选取在拐点处的值为软阈值,这里power=4。
#' 获取预测的power值,如果没有则根据以下规则获取
power = sft$powerEstimate
if (is.na(power)){
power = ifelse(nSamples<20, ifelse(type == "unsigned", 9, 18),
ifelse(nSamples<30, ifelse(type == "unsigned", 8, 16),
ifelse(nSamples<40, ifelse(type == "unsigned", 7, 14),
ifelse(type == "unsigned", 6, 12))
)
)
}
接下来计算基因模块:
#' 确定模块modules
#' TOMType = "unsigned"构建无尺度网络;minModuleSize = 30最小模块基因数为30;
#' TOM矩阵(topological overlap matrix,TOM拓扑重叠矩阵):数值都在[0,1]间,
#' 是用于反应两两基因共表达的相似度,值越高,说明共表达相似度越高
cor = WGCNA::cor #' 指定使用WGCNA包中的相关性计算函数
net = blockwiseModules(datExpr0, power = power, maxBlockSize = nGenes,
TOMType = "unsigned", minModuleSize = 30,
reassignThreshold = 0, mergeCutHeight = 0.25,
numericLabels = TRUE, pamRespectsDendro = FALSE,
saveTOMs=TRUE, corType = "pearson",
maxPOutliers=1, loadTOMs=TRUE,
saveTOMFileBase = paste0("first", ".tom"),
verbose = 3)
cor = stats::cor
table(net$colors)
#' TOM矩阵的层次聚类及模块颜色结果可视化
moduleLabels = net$colors
moduleColors = labels2colors(moduleLabels) #' 将标签转换为颜色color
pdf('modules_fig3.pdf',18,10)
plotDendroAndColors(net$dendrograms[[1]], moduleColors[net$blockGenes[[1]]],
"Module colors",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
dev.off()
modules_fig3.png
这个图呢,上面部分是基因聚类,下面颜色部分是不同的基因模块。
接下来绘制校正矩阵的热图:
#' 校正矩阵热图
MEs = net$MEs
MEs_col = MEs
colnames(MEs_col) = paste0("ME", labels2colors(as.numeric(str_replace_all(colnames(MEs),"ME",""))))
MEs_col = orderMEs(MEs_col)
pdf('adjacency_fig4.pdf',18,10)
plotEigengeneNetworks(MEs_col, "Eigengene adjacency heatmap",
marDendro = c(3,3,2,4),
marHeatmap = c(3,4,2,2), plotDendrograms = T,
xLabelsAngle = 90)
dev.off()
adjacency_fig4.png
模块与模块之间相关性可视化:
#’ TOM矩阵热图可视化,模块与模块之间的相关性
load(net$TOMFiles[1], verbose=T)
TOM <- as.matrix(TOM)
dissTOM = 1-TOM
plotTOM = dissTOM^7
diag(plotTOM) = NA
pdf('TOM_fig5.pdf',18,10)
TOMplot(plotTOM, net$dendrograms, moduleColors,
main = "Network heatmap plot, all genes")
dev.off()
TOM_fig5.png
绘制每一个模块中关键基因的网状图:
#' 模块里的关键基因
probes = colnames(datExpr0)
dimnames(TOM) <- list(probes, probes)
cyt = exportNetworkToCytoscape(TOM,
edgeFile = paste("first", ".edges.txt", sep=""),
nodeFile = paste("first", ".nodes.txt", sep=""),
weighted = TRUE, threshold = 0,
nodeNames = probes, nodeAttr = moduleColors)
#' 可视化每个模块的关键基因的网络图,去掉了grey颜色的模块
edges <- read.table(paste("first", ".edges.txt", sep=""),sep='\t',header=T)
nodes <- read.table(paste("first", ".nodes.txt", sep=""),sep='\t',header=T)
library(RColorBrewer)
coul<-c(brewer.pal(8, "Dark2"),brewer.pal(12, "Paired"),brewer.pal(8, "Set2"),brewer.pal(9, "Set1"))
colnames(nodes) <- c(colnames(nodes)[1:2],'group')
lgroup <- unique(nodes$group)
lgroup <- lgroup[-grep('grey',lgroup)]
for (i in lgroup){
nodes1 <- subset(nodes,group==i)
c1 <- which(edges[,1] %in% nodes1[,1])
edges1 <- edges[c1,]
c1 <- which(edges1[,2] %in% nodes1[,1])
edges1 <- edges1[c1,]
mtitle <- dim(nodes1)[1]
plot_network(edges=edges1,nodes=nodes1,group.by=NULL,weight=NULL,coul = coul,layout=layout.sphere,pf=i,main=mtitle)
}
#' 筛选HUG基因,并绘制每个模块中的hug基因网络图
library(dplyr)
edgeData1 <- cyt$edgeData[,c(1,2,3)]
edgeData2 <- cyt$edgeData[,c(2,1,3)]
nodeattrib <- cyt$nodeData[,c(1,3)]
colnames(nodeattrib) <- c("nodeName", "Module")
colnames(edgeData1) <- c("Node1","Node2","Weight")
colnames(edgeData2) <- c("Node1","Node2","Weight")
edgeData <- rbind(edgeData1, edgeData2)
edgeData$Module1 <- nodeattrib[match(edgeData$Node1, nodeattrib$nodeName), 2]
edgeData$Module2 <- nodeattrib[match(edgeData$Node2, nodeattrib$nodeName), 2]
edgeData <- edgeData[edgeData$Module1==edgeData$Module2,c(1,3,4)] #'同一个模块内的基因网络信息
nodeTotalWeight <- edgeData %>% group_by(Node1, Module1) %>% summarise(weight=sum(Weight))
nodeTotalWeight <- nodeTotalWeight[with(nodeTotalWeight, order(Module1, -weight)),]
nodeTotalWeightTop = nodeTotalWeight %>% group_by(Module1) %>% top_n(10, weight) %>% data.frame()
write.table(nodeTotalWeightTop,'hug_gene_list.xls',sep='\t',quote=F)
lgroup <- unique(nodeTotalWeightTop$Module1)
lgroup <- lgroup[-grep('grey',lgroup)]
for (i in lgroup) {
nodes1 <- subset(nodeTotalWeightTop,Module1==i)
c1 <- which(edges[,1] %in% nodes1[,1])
edges1 <- edges[c1,]
c1 <- which(edges1[,2] %in% nodes1[,1])
edges1 <- edges1[c1,]
c1 <- which(nodes[,1] %in% nodes1[,1])
nodes1 <- nodes[c1,]
nodes1 <- subset(nodes1,group==i)
mtitle <- dim(nodes1)[1]
plot_network(edges=edges1,nodes=nodes1,group.by=NULL,weight='weight',coul = coul,layout=layout.sphere,pf=paste0("hug_genes_",i),main=mtitle)
}
接下来是将性状信息与基因模块关联起来的一些分析:
#' 计算基因模块与性状相关性
#' 需要性状相关的数据框信息
trait=new.sc@meta.data #' 性状相关的数据框信息
#trait=trait[,-1]
corType = "pearson"
robustY = ifelse(corType=="pearson",T,F)
if(!is.null(trait)) {
traitData = trait
sampleName = rownames(datExpr0)
traitData = traitData[match(sampleName, rownames(traitData)), ]
if (corType=="pearsoon") {
modTraitCor = cor(MEs_col, traitData, use = "p")
modTraitP = corPvalueStudent(modTraitCor, nSamples)
} else {
modTraitCorP = bicorAndPvalue(MEs_col, traitData, robustY=robustY)
modTraitCor = modTraitCorP$bicor
modTraitP = modTraitCorP$p
}
textMatrix = paste(signif(modTraitCor, 2), "\n(", signif(modTraitP, 1), ")", sep = "")
dim(textMatrix) = dim(modTraitCor)
pdf('corMetaGene_fig6.pdf',18,10)
labeledHeatmap(Matrix = modTraitCor, xLabels = colnames(traitData),
yLabels = colnames(MEs_col),
cex.lab = 0.5,
ySymbols = colnames(MEs_col), colorLabels = FALSE,
colors = blueWhiteRed(50),
textMatrix = textMatrix, setStdMargins = FALSE,
cex.text = 0.5, zlim = c(-1,1),
main = paste("Module-trait relationships"))
dev.off()
}
module='brown'
pheno='orig.ident'
if (corType=="pearsoon") {
geneModuleMembership = as.data.frame(cor(datExpr0, MEs_col, use = "p"))
MMPvalue = as.data.frame(corPvalueStudent(
as.matrix(geneModuleMembership), nSamples))
}else {
geneModuleMembershipA = bicorAndPvalue(datExpr0, MEs_col, robustY=robustY)
geneModuleMembership = geneModuleMembershipA$bicor
MMPvalue = geneModuleMembershipA$p
}
if (corType=="pearsoon") {
geneTraitCor = as.data.frame(cor(datExpr0, traitData, use = "p"))
geneTraitP = as.data.frame(corPvalueStudent(
as.matrix(geneTraitCor), nSamples))
} else {
geneTraitCorA = bicorAndPvalue(datExpr0, traitData, robustY=robustY)
geneTraitCor = as.data.frame(geneTraitCorA$bicor)
geneTraitP = as.data.frame(geneTraitCorA$p)
}
modNames = substring(colnames(MEs_col), 3)
module_column = match(module, modNames)
pheno_column = match(pheno,colnames(traitData))
moduleGenes = moduleColors == module
sizeGrWindow(7, 7)
par(mfrow = c(1,1))
pdf(paste0(module,"_",pheno,'_verboseScatterplot_fig7.pdf'),18,10)
verboseScatterplot(abs(geneModuleMembership[moduleGenes, module_column]),
abs(geneTraitCor[moduleGenes, pheno_column]),
xlab = paste("Module Membership in", module, "module"),
ylab = paste("Gene significance for", pheno),
main = paste("Module membership vs. gene significance\n"),
cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
dev.off()
总结:是稍微复杂了一点。
网友评论