获取表达数据矩阵
这里运行R语言包GDCRNATools的帮助文档中的例子获得胆管癌的rna表达矩阵
library(GDCRNATools)
project<-'TCGA-CHOL'
rnadir<-paste(project,'RNAseq',sep='/')
clinicaldir<-paste(project,'Clinical',sep='/')
gdcRNADownload(project.id = 'TCGA-CHOL',
data.type = 'RNAseq',
write.manifest = F,
method = 'gdc-client',
directory = rnadir)
gdcClinicalDownload(project.id = 'TCGA-CHOL',
write.manifest = F,
method='gdc-client',
directory = clinicaldir)
metaMatrix.RNA<-gdcParseMetadata(project.id = 'TCGA-CHOL',
data.type = 'RNAseq',
write.meta = F)
metaMatrix.RNA<-gdcFilterDuplicate(metaMatrix.RNA)
metaMatrix.RNA<-gdcFilterSampleType(metaMatrix.RNA)
print("样本比例:")
table(metaMatrix.RNA$sample_type)
rnaCounts<-gdcRNAMerge(metadata = metaMatrix.RNA,
path = rnadir,
organized = FALSE,
data.type = 'RNAseq')
使用R语言包edgeR做差异表达分析
这部分代码完全来自公众号生信星球文章TCGA(转录组)差异分析三大R包及其结果对比
library(stringr)
group_list<-ifelse(str_sub(colnames(rnaCounts),14,15)=="01","tumor","normal")
group_list
table(group_list)
library(edgeR)
deg<-DGEList(counts=rnaCounts,group=group_list)
deg$samples$lib.size<-colSums(deg$counts)
deg<-calcNormFactors(deg)
design<-model.matrix(~0+group_list)
rownames(design)<-colnames(deg)
colnames(design)<-levels(group_list)
dge<-estimateGLMCommonDisp(deg,design)
dge<-estimateGLMTrendedDisp(dge,design)
dge<-estimateGLMTagwiseDisp(dge,design)
fit<-glmFit(dge,design)
fit2<-glmLRT(fit,contrast = c(-1,1))
DEG<-topTags(fit2,n=nrow(rnaCounts))
DEG<-as.data.frame(DEG)
logFC_cutoff<-with(DEG,mean(abs(logFC))+2*sd(abs(logFC)))
logFC_cutoff
DEG$change<-as.factor(ifelse(DEG$PValue<0.05&abs(DEG$logFC)>logFC_cutoff,
ifelse(DEG$logFC>logFC_cutoff,"UP","DOWN"),"NOT"))
head(DEG)
table(DEG$change)
绘制差异基因聚类热图和火山图
source("../3-plotfunction.R")
cg1<-rownames(DEG)[DEG$change!="NOT"]
h1<-draw_heatmap(rnaCounts[cg1,],group_list)
v1<-draw_volcano(test=DEG[,c(1,4,6)])
png("edgeR_DEG.png",width=1000)
ggpubr::ggarrange(h1,v1,ncol=2,nrow = 1)
dev.off()
image.png
接下来利用差异表达基因做加权基因共表达网络(WGCNA)
WGCNA分析用到的代码是我在腾讯课堂上购买的一门课程,课程内容是介绍WGCNA分析在植物上的应用的。课程的内容包括基本原理的介绍,讲解文献,实例操作。我自己感觉内容还是非常棒的。课程名字我就不再这里放了,大家如果感兴趣可以给我的公众号留言。课程中用到的代码和数据如果大家需要的话也可以给我的公众号留言。
另外还有一大部分代码来自生信技能树公众号文章七步走纯R代码通过数据挖掘复现一篇实验文章(第七步WGCNA)
#标准化基因表达矩阵,这一步是因为我之前尝试WGCNA分析的时候使用原始的counts会遇到报错
#提示输入数据不能为整数,我还不知道WGCNA应该用什么作为输入数据
expCPM<-cpm(rnaCounts, normalized.lib.sizes=TRUE)
expCPM[1:3,1:3]
#将用到的文件保存下来
write.csv(rnaCounts,file="TCGA-CHOL-rnaCounts.csv",quote = F)
write.csv(expCPM,file="TCGA-CHOL-expCPM.csv",quote=F)
#只选用差异表达基因做分析,我这么做事为了减小数据量,缩短运算时间
#我现在还不知道,只用差异表达基因与用完整的基因集有什么区别
expCPM<-expCPM[cg1,]
dim(expCPM)
library(WGCNA)
enableWGCNAThreads()
datExpr<-as.data.frame(t(expCPM))
group_list=factor(ifelse(as.numeric(substr(rownames(datExpr),14,15)) < 10,'tumor','normal'))
table(group_list)
datTraits<-data.frame(row.names = rownames(datExpr),
subtype=group_list)
#根据层次聚类绘制样本树
tree=hclust(dist(datExpr),method = 'average')
png("hclusttree.png",width=800)
plot(tree)
dev.off()
image.png
选择软阈值
powers1<-c(1:10,seq(12,30,by=2))
sft = pickSoftThreshold(datExpr,
RsquaredCut = 0.9,
powerVector = powers1,
verbose = 5)
png("pickSoftThres.png",width=1000)
par(mfrow = c(1,2));
cex1 = 0.9;
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers1,cex=cex1,col="red");
abline(h=0.90,col="red")
abline(h=0.85,col="blue")
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers1, cex=cex1,col="red")
dev.off()
image.png
sft$powerEstimate
setwd("WGCNA_example/")
net = blockwiseModules(
datExpr,
power = sft$powerEstimate,
maxBlockSize = 10000,
TOMType = "unsigned",
deepSplit = 2, minModuleSize = 30,
mergeCutHeight = 0.5,
numericLabels = TRUE,
pamRespectsDendro = FALSE,
saveTOMs = TRUE,
saveTOMFileBase = "FPKM-TOM",
loadTOMs = FALSE,
verbose = 3
)
table(net$colors)
moduleColors = labels2colors(net$colors)
table(moduleColors)
MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes
MEs = orderMEs(MEs0)
for(module in substring(colnames(MEs),3)){
if(module == "grey") next
ME=as.data.frame(MEs[,paste("ME",module,sep="")])
colnames(ME)=module
datModExpr=datExpr[,moduleColors==module]
datKME = signedKME(datModExpr, ME)
datKME=cbind(datKME,rep(module,length(datKME)))
write.table(datKME,quote = F,row.names = T,append = T,file = "All_Gene_KME.txt",col.names = F)
}
png("1.png",width=1000)
plotDendroAndColors(net$dendrograms[[1]], moduleColors[net$blockGenes[[1]]],
"Module colors",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
dev.off()
image.png
design=model.matrix(~0+ datTraits$subtype)
design = as.data.frame(design)
colnames(design)=levels(datTraits$subtype)
moduleTraitCor = cor(MEs, design , use = "p")
nSamples = nrow(datExpr)
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
# Will display correlations and their p-values
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep = "")
dim(textMatrix) = dim(moduleTraitCor)
# Display the correlation values within a heatmap plot
png("2.png")
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = colnames(design),
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = FALSE,
colors = greenWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.5,
zlim = c(-1,1),
main = paste("Module-trait relationships"))
dev.off()
image.png
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
geneTree = net$dendrograms[[1]];
dissTOM = 1-TOMsimilarityFromExpr(datExpr, power = 14);
plotTOM = dissTOM^7;
diag(plotTOM) = NA;
png("3.png")
TOMplot(plotTOM, geneTree, moduleColors, main = "Network heatmap plot, all genes")
dev.off()
image.png
MEs = moduleEigengenes(datExpr, moduleColors)$eigengenes
tumor = as.data.frame(design[,2])
names(tumor) = "tumor"
MET = orderMEs(cbind(MEs, tumor))
png("4.png")
plotEigengeneNetworks(MET, "", marDendro = c(0,4,1,2),
marHeatmap = c(3,4,1,2), cex.lab = 0.8, xLabelsAngle = 90)
dev.off()
image.png
第一次完整走完加权基因共表达网络的流程!可是如何解读结果还需要多看文章呀!
欢迎大家关注我的公众号
小明的数据分析笔记本
如果需要文章中的数据,可以在我的公众号留言!
中国加油!武汉加油!
网友评论