共表达基因网络分析
构建假细胞的函数
library(Seurat)
library(data.table)
library(dplyr)
condense_cell <- function(obj,group.by=NULL,size=NULL,slot='data',assay='RNA',fun=mean){
mat <- data.table(t(as.matrix(slot(obj@assays[[assay]],slot))),keep.rownames=T)
meta <- obj@meta.data
meta$Celltype <- meta[,group.by]
cellist <- rownames(meta)
lcod <- c()
lscell <- c()
for (i in unique(meta$Celltype)) {
c1 <- which(meta$Celltype==i)
cellist1 <- cellist[c1]
cod <- floor(seq_along(cellist1)/size)
t1 <- data.frame(table(cod))
t1[,1] <- as.vector(t1[,1])
t1[,2] <- as.vector(t1[,2])
c2 <- which(t1[,2]<size)
short <-t1[,1][c2]
c3 <- which(cod %in% short)
cod[c3] <- t1[,1][which(t1[,2]>=size)[1]]
cod <- paste0(i,"_Cell",cod)
scell <- sample(cellist1)
lcod <- c(lcod,cod)
lscell <- c(lscell,scell)
}
map_df <- data.frame(lscell,lcod)
colnames(map_df) <- c("old_id","new_id")
rownames(map_df) <- map_df$old_id
map_df <- map_df[mat$rn,]
mat$rn <- map_df$new_id
mat <- mat[,lapply(.SD,fun), by=rn]
rownames(mat) <- mat$rn
gename <- colnames(mat)
celname <- rownames(mat)
mat <- t(mat[,rn:=NULL])
colnames(mat) <- celname
rownames(mat) <- gename
obj <- CreateSeuratObject(mat)
return(obj)
}
画网络图的函数
library(igraph)
plot_network <- function(edges,nodes=NULL,group.by=NULL,weight=NULL,coul = NULL,layout=layout.sphere,pf=NULL,main=""){
if (!is.null(weight)) {
edges$weight <- edges[,weight]
}else{
edges$weight <- 1
}
if (!is.null(group.by)) {
nodes$group <- nodes[,group.by]
}else{
nodes$group <- 'group1'
}
network <- graph_from_data_frame(d=edges, vertices=nodes, directed=F)
library(RColorBrewer)
nlen <- length(unique(V(network)$group))
if (is.null(coul)) {
library(RColorBrewer)
coul <- c(brewer.pal(8, "Dark2"),brewer.pal(12, "Paired"),brewer.pal(8, "Set2"),brewer.pal(9, "Set1"))
}
coul <- coul[1:nlen]
vertex.color <- coul[as.numeric(as.factor(V(network)$group))]
pdf(paste0(group.by,'_',pf,'_network.pdf'),18,10)
plot(network, vertex.color=vertex.color, edge.width=E(network)$weight*2,layout=layout,main=main)
legend("topright", legend=levels(as.factor(V(network)$group)),
col = coul, bty = "n", pch=20 , pt.cex = 3,
cex = 1.5, text.col=coul , horiz = FALSE,
inset = c(0.1, 0.1))
dev.off()
}
读取数据并构建假细胞,需要选择归一化的矩阵,故选择data
mat <- Read10X('scWGCNA/filtered_gene_bc_matrices/hg19')
ob1 <- CreateSeuratObject(mat)
构建表型数据,即生成无监督聚类类群列seurat_clusters作为假的表型数据,进行简单聚类,这里用到我之前写的一个函数seob_cluster,其实就是进行标准流程的聚类即可。
ob1 <- seob_cluster(ob1)
ob1 <- condense_cell(ob1,group.by='seurat_clusters',size=10,slot='data',assay='RNA',fun=mean)
数据QC
加载安装包
#Load required packages
library(WGCNA)
library(reshape2)
library(stringr)
options(stringsAsFactors = FALSE)
开启多线程
#Enable mutithreads
enableWGCNAThreads()
基本参数设置
type,官方推荐 "signed" 或 "signed hybrid",官网教程用的unsigned
corType,官网推荐pearson或bicor
corFnc,相关性计算使用的函数,pearson用cor函数,bicor用bicor函数
maxPOutliers和robustY,对二元变量,如样本性状信息计算相关性时,或基因表达严重依赖于疾病状态时,需设置的参数
type = "unsigned"
corType = "pearson"
corFnc = ifelse(corType=="pearson", cor, bicor)
maxPOutliers = ifelse(corType=="pearson",1,0.05)
robustY = ifelse(corType=="pearson",T,F)
获取压缩好的矩阵
mat <- ob1@assays$RNA@counts
dataExpr <- mat
根据MAD值筛选基因,默认筛选前75%的基因,并且MAD值需大于0.01的基因。
pct.mad <- 0.75
m.mad <- apply(dataExpr,1,mad)
dataExprVar <- as.matrix(dataExpr[which(m.mad > max(quantile(m.mad, probs=seq(0, 1, 1-pct.mad))[2],0.01)),])
dataExpr <- as.data.frame(t(dataExprVar))
筛选基因和样本
gsg = goodSamplesGenes(dataExpr, verbose = 3)
if (!gsg$allOK){
# Optionally, print the gene and sample names that were removed:
if (sum(!gsg$goodGenes)>0)
printFlush(paste("Removing genes:",
paste(names(dataExpr)[!gsg$goodGenes], collapse = ",")));
if (sum(!gsg$goodSamples)>0)
printFlush(paste("Removing samples:",
paste(rownames(dataExpr)[!gsg$goodSamples], collapse = ",")));
# Remove the offending genes and samples from the data:
dataExpr = dataExpr[gsg$goodSamples, gsg$goodGenes]
}
nGenes = ncol(dataExpr)
nSamples = nrow(dataExpr)
可视化筛选后的样本层级聚类树
sampleTree = hclust(dist(dataExpr), method = "average")
pdf('fig1.sampleTree.pdf',18,10)
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="")
dev.off()
计算软阈值power
powers = c(c(1:10), seq(from = 12, to=30, by=2))
sft = pickSoftThreshold(dataExpr, powerVector=powers,networkType=type, verbose=5)
可视化不同取值的power
pdf('fig2.powers.pdf',18,10)
par(mfrow = c(1,2))
cex1 = 0.9
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",
ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"))
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red")
abline(h=0.85,col="red")
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers,
cex=cex1, col="red")
dev.off()
获取估计的power
power = sft$powerEstimate
如果估计的power值为NA则可以根据经验赋值
if (is.na(power)){
power = ifelse(nSamples<20, ifelse(type == "unsigned", 9, 18),
ifelse(nSamples<30, ifelse(type == "unsigned", 8, 16),
ifelse(nSamples<40, ifelse(type == "unsigned", 7, 14),
ifelse(type == "unsigned", 6, 12))
)
)
}
构建共表达网络
label='test'
net = blockwiseModules(dataExpr, power = power, maxBlockSize = nGenes,
TOMType = type, minModuleSize = 30,
reassignThreshold = 0, mergeCutHeight = 0.25,
numericLabels = TRUE, pamRespectsDendro = FALSE,
saveTOMs=TRUE, corType = corType,
maxPOutliers=maxPOutliers, loadTOMs=TRUE,
saveTOMFileBase = paste0(label, ".tom"),
verbose = 3)
可视化基因共表达网络树,灰色表示没有纳入模块的基因
moduleLabels = net$colors
moduleColors = labels2colors(moduleLabels)
pdf('fig3.plotDendroAndColors.pdf',18,10)
plotDendroAndColors(net$dendrograms[[1]], moduleColors[net$blockGenes[[1]]],
"Module colors",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
dev.off()
可视化基因模块
MEs = net$MEs
MEs_col = MEs
colnames(MEs_col) = paste0("ME", labels2colors(
as.numeric(str_replace_all(colnames(MEs),"ME",""))))
MEs_col = orderMEs(MEs_col)
pdf('fig4.plotEigengeneNetworks.pdf',18,10)
plotEigengeneNetworks(MEs_col, "Eigengene adjacency heatmap",
marDendro = c(3,3,2,4),
marHeatmap = c(3,4,2,2), plotDendrograms = T,
xLabelsAngle = 90)
dev.off()
可视化基因调控网络,太耗时间了,建议不画
load(net$TOMFiles[1], verbose=T)
TOM <- as.matrix(TOM)
dissTOM = 1-TOM
plotTOM = dissTOM^7
diag(plotTOM) = NA
pdf('fig5.TOMplot.pdf',18,10)
TOMplot(plotTOM, net$dendrograms, moduleColors,
main = "Network heatmap plot, all genes")
dev.off()
data:image/s3,"s3://crabby-images/e33d0/e33d061201c0bbb68f80c14322adef233e5b1627" alt=""
导出共表达网络
probes = colnames(dataExpr)
dimnames(TOM) <- list(probes, probes)
cyt = exportNetworkToCytoscape(TOM,
edgeFile = paste(label, ".edges.txt", sep=""),
nodeFile = paste(label, ".nodes.txt", sep=""),
weighted = TRUE, threshold = 0,
nodeNames = probes, nodeAttr = moduleColors)
共表达网络图
edges <- read.table(paste(label, ".edges.txt", sep=""),sep='\t',header=T)
nodes <- read.table(paste(label, ".nodes.txt", sep=""),sep='\t',header=T)
library(RColorBrewer)
coul<-c(brewer.pal(8, "Dark2"),brewer.pal(12, "Paired"),brewer.pal(8, "Set2"),brewer.pal(9, "Set1"))
colnames(nodes) <- c(colnames(nodes)[1:2],'group')
lgroup <- unique(nodes$group)
lgroup <- lgroup[-grep('grey',lgroup)]
for (i in lgroup) {
nodes1 <- subset(nodes,group==i)
c1 <- which(edges[,1] %in% nodes1[,1])
edges1 <- edges[c1,]
c1 <- which(edges1[,2] %in% nodes1[,1])
edges1 <- edges1[c1,]
mtitle <- dim(nodes1)[1]
plot_network(edges=edges1,nodes=nodes1,group.by=NULL,weight=NULL,coul = coul,layout=layout.sphere,pf=i,main=mtitle)
}
筛选hug基因
library(dplyr)
edgeData1 <- cyt$edgeData[,c(1,2,3)]
edgeData2 <- cyt$edgeData[,c(2,1,3)]
nodeattrib <- cyt$nodeData[,c(1,3)]
colnames(nodeattrib) <- c("nodeName", "Module")
colnames(edgeData1) <- c("Node1","Node2","Weight")
colnames(edgeData2) <- c("Node1","Node2","Weight")
edgeData <- rbind(edgeData1, edgeData2)
edgeData$Module1 <- nodeattrib[match(edgeData$Node1, nodeattrib$nodeName), 2]
edgeData$Module2 <- nodeattrib[match(edgeData$Node2, nodeattrib$nodeName), 2]
edgeData <- edgeData[edgeData$Module1==edgeData$Module2,c(1,3,4)]
nodeTotalWeight <- edgeData %>% group_by(Node1, Module1) %>% summarise(weight=sum(Weight))
nodeTotalWeight <- nodeTotalWeight[with(nodeTotalWeight, order(Module1, -weight)),]
nodeTotalWeightTop = nodeTotalWeight %>% group_by(Module1) %>% top_n(hug.top, weight) %>% data.frame()
write.table(nodeTotalWeightTop,'hug_gene_list.xls',sep='\t',quote=F)
lgroup <- unique(nodeTotalWeightTop$Module1)
lgroup <- lgroup[-grep('grey',lgroup)]
for (i in lgroup) {
nodes1 <- subset(nodeTotalWeightTop,Module1==i)
c1 <- which(edges[,1] %in% nodes1[,1])
edges1 <- edges[c1,]
c1 <- which(edges1[,2] %in% nodes1[,1])
edges1 <- edges1[c1,]
c1 <- which(nodes[,1] %in% nodes1[,1])
nodes1 <- nodes[c1,]
nodes1 <- subset(nodes1,group==i)
mtitle <- dim(nodes1)[1]
plot_network(edges=edges1,nodes=nodes1,group.by=NULL,weight='weight',coul = coul,layout=layout.sphere,pf=paste0("hug_genes_",i),main=mtitle)
}
计算基因模块与性状的相关性
trait=ob1@meta.data
if(!is.null(trait)) {
traitData = trait
sampleName = rownames(dataExpr)
traitData = traitData[match(sampleName, rownames(traitData)), ]
if (corType=="pearsoon") {
modTraitCor = cor(MEs_col, traitData, use = "p")
modTraitP = corPvalueStudent(modTraitCor, nSamples)
} else {
modTraitCorP = bicorAndPvalue(MEs_col, traitData, robustY=robustY)
modTraitCor = modTraitCorP$bicor
modTraitP = modTraitCorP$p
}
textMatrix = paste(signif(modTraitCor, 2), "\n(", signif(modTraitP, 1), ")", sep = "")
dim(textMatrix) = dim(modTraitCor)
pdf('fig6.labeledHeatmap.pdf',18,10)
labeledHeatmap(Matrix = modTraitCor, xLabels = colnames(traitData),
yLabels = colnames(MEs_col),
cex.lab = 0.5,
ySymbols = colnames(MEs_col), colorLabels = FALSE,
colors = blueWhiteRed(50),
textMatrix = textMatrix, setStdMargins = FALSE,
cex.text = 0.5, zlim = c(-1,1),
main = paste("Module-trait relationships"))
dev.off()
}
保存结果
save(corType,dataExpr, MEs_col, traitData, nSamples, net, file=paste0(label,'_result.RData'))
load(indata)
module='brown'
pheno='orig.ident'
if (corType=="pearsoon") {
geneModuleMembership = as.data.frame(cor(dataExpr, MEs_col, use = "p"))
MMPvalue = as.data.frame(corPvalueStudent(
as.matrix(geneModuleMembership), nSamples))
}else {
geneModuleMembershipA = bicorAndPvalue(dataExpr, MEs_col, robustY=robustY)
geneModuleMembership = geneModuleMembershipA$bicor
MMPvalue = geneModuleMembershipA$p
}
if (corType=="pearsoon") {
geneTraitCor = as.data.frame(cor(dataExpr, traitData, use = "p"))
geneTraitP = as.data.frame(corPvalueStudent(
as.matrix(geneTraitCor), nSamples))
} else {
geneTraitCorA = bicorAndPvalue(dataExpr, traitData, robustY=robustY)
geneTraitCor = as.data.frame(geneTraitCorA$bicor)
geneTraitP = as.data.frame(geneTraitCorA$p)
}
modNames = substring(colnames(MEs_col), 3)
module_column = match(module, modNames)
pheno_column = match(pheno,colnames(traitData))
moduleGenes = moduleColors == module
sizeGrWindow(7, 7)
par(mfrow = c(1,1))
pdf(paste0(module,"_",pheno,'_verboseScatterplot.pdf'),18,10)
verboseScatterplot(abs(geneModuleMembership[moduleGenes, module_column]),
abs(geneTraitCor[moduleGenes, pheno_column]),
xlab = paste("Module Membership in", module, "module"),
ylab = paste("Gene significance for", pheno),
main = paste("Module membership vs. gene significance\n"),
cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
dev.off()
}
网友评论