本文目的:一文解决WGCNA分析问题。
原文章使用了自己识别的五个lncRNA,与mRNA合并做WGCNA分析,目的是为了得到lncRNA相关的mRNA。所以这里,我们做WGCNA,所需要的数据可以推测其包括:lncRNA表达量,mRNA表达矩阵,一些临床参数数据。
代码WGCNA_prepare.R(给WGCNA分析做前期数据准备)
# =======================================================
#########################################################
#########################################################
#作者:工程师2号 ########################################
#简书笔记博客(柳叶刀与小鼠标)##########################
# https://www.jianshu.com/u/619b87e54936 ###############
#########################################################
#########################################################
# =======================================================
# =======================================================
#### 下载表达数据################################
# =======================================================
library(TCGAbiolinks)
library(SummarizedExperiment)
library(dplyr)
library(DT)
library(tibble)
rm(list=ls())
setwd('D:\\test\\Five_key_lncRNAs\\chapter1\\download\\expression')
exp <- GDCquery(project = "TCGA-PAAD",
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "HTSeq - FPKM")
GDCdownload(exp)
expdat <- GDCprepare(query =exp)
count_matrix= as.data.frame(assay(expdat))
setwd("D:\\Originaldata\\GRCH\\Homo_sapiens.GRCh38.90")
load("gtf_df.Rda")
test <- gtf_df[1:5,]
View(test)
fpkmToTpm <- function(fpkm)
{
exp(log(fpkm) - log(sum(fpkm)) + log(1e6))
}
expr <- as.data.frame (apply(count_matrix , 2, fpkmToTpm))
expr <- expr %>% rownames_to_column("gene_id")
###表达矩阵中提取mRNA表达矩阵
mRNA_exprSet <- gtf_df %>%
dplyr::filter(type=="gene",gene_biotype=="protein_coding") %>%
dplyr::select(c(gene_name,gene_id,gene_biotype)) %>%
dplyr::inner_join(expr,by ="gene_id") %>%
tidyr::unite(gene_id,gene_name,gene_id,gene_biotype,sep = " | ")
save(mRNA_exprSet,file = "mRNA_exprSet.Rda")
# =======================================================
#### 提取mRNA表达矩阵################################
# =======================================================
mRNA_exprSet <- mRNA_exprSet %>%
tidyr::separate(gene_id, c("gene_name","gene_id","gene_biotype"),
sep = " \\| ")
mRNA_exprSet <- mRNA_exprSet[,-(2:3)]
index <- duplicated(mRNA_exprSet$gene_name)
mRNA_exprSet <- mRNA_exprSet[!index,]
row.names(mRNA_exprSet) <- mRNA_exprSet$gene_name
mRNA_exprSet$gene_name <- NULL
###第五节:删除癌旁样本和二次测序的样本
metadata <- data.frame(colnames(mRNA_exprSet))
for (i in 1:length(metadata[,1])) {
num <- as.numeric(as.character(substring(metadata[i,1],14,15)))
if (num == 1 ) {metadata[i,2] <- "T"}
if (num != 1) {metadata[i,2] <- "N"}
}
names(metadata) <- c("id","group")
metadata$group <- as.factor(metadata$group)
metadata <- subset(metadata,metadata$group == "T")
metadata
mRNA_exprSet1 <- mRNA_exprSet[,which(colnames(mRNA_exprSet) %in% metadata$id)]
metadata <- data.frame(colnames(mRNA_exprSet1))
for (i in 1:length(metadata[,1])) {
chr <- as.character(substring(metadata[i,1],22,25))
if ( chr == 'A277' ) {metadata[i,2] <- "Rep"}
if ( chr!= 'A277' ) {metadata[i,2] <- "T"}
}
names(metadata) <- c("id","group")
metadata$group <- as.factor(metadata$group)
metadata <- subset(metadata,metadata$group == "T")
metadata
###保存mRNA表达矩阵
mRNA_exprSet2 <- mRNA_exprSet1[,which(colnames(mRNA_exprSet1) %in% metadata$id)]
setwd('D:\\test\\Five_key_lncRNAs\\chapter3')
save(mRNA_exprSet2,file = 'mRNA_exprSet2.Rdata')
# =======================================================
####提取lncRNA表达矩阵###########################
# ======================================================
setwd('D:\\test\\Five_key_lncRNAs\\chapter3')
library(survival)
library(future.apply)
plan(multiprocess)
library(TCGAbiolinks)
library(dplyr)
library(DT)
library(tibble)
rm(list=ls())
survival_data <- read.csv('survival.csv',
header = T,row.names = 1) %>%
dplyr::select( Barcode,OS ,OS.Time)
load('mRNA_exprSet2.Rdata')
mRNA_exprSet <- mRNA_exprSet2
colnames(mRNA_exprSet) <- substr(x=colnames(mRNA_exprSet),
start = 1,
stop = 12)
mRNA_exprSet <- as.data.frame(t(mRNA_exprSet))
mRNA_exprSet <- mRNA_exprSet +0.001
a <- mRNA_exprSet[1:6,1:6]
myfun_cv <-function(x){
cv<- sd(x)/mean(x)
return(cv)
}
#挑选变异系数大于0.3的mRNA基因
cv_gene <- as.data.frame(apply(mRNA_exprSet, 2, myfun_cv))
colnames(cv_gene) <- 'cv'
cv_gene <- subset(cv_gene,cv_gene$cv > 0.3)
mRNA_exprSet <- mRNA_exprSet[,which(colnames(mRNA_exprSet) %in% rownames(cv_gene))]
mRNA_exprSet$Barcode <- rownames(mRNA_exprSet)
survival_dat <- merge(survival_data, mRNA_exprSet, by='Barcode')
####mRNA单因素分析,因为没必要用所有的mRNA来做WGCNA
#仅仅需要有生存意义的mRNA和我们选定的三个lncRNA即可
colnames(survival_dat) <- sub("\\-", "", colnames(survival_dat))
colnames(survival_dat) <- sub("\\.", "_", colnames(survival_dat))
colnames(survival_dat) <- sub("\\:", "_", colnames(survival_dat))
covariates <- as.character(colnames(survival_dat))[-(1:3)]
univ_formulas <- sapply(covariates,
function(x){
as.formula(paste('Surv(OS_Time, OS)~', x))})
univ_models <- future_lapply( univ_formulas,
function(x){coxph(x, data = survival_dat)})
univ_results <- lapply(univ_models,
function(x){
x <- summary(x)
p.value <- signif(x$wald["pvalue"], digits = 2)
beta <- signif(x$coef[1], digits = 2)
HR <- signif(x$coef[2], digits = 2)
HR.confint.lower <- signif(x$conf.int[,"lower .95"], 2)
HR.confint.upper <- signif(x$conf.int[,"upper .95"], 2)
HR <- paste0(HR, " (",
HR.confint.lower, "-", HR.confint.upper, ")")
res <- c(beta, HR, p.value)
names(res) <- c("coef", "HR (95% CI for HR)", "p.value")
return(res)
})
res <- as.data.frame(t(do.call(cbind, univ_results)))
res[1:6,]
res$p.value <- as.numeric(as.character(res$p.value))
res$coef <- as.numeric(as.character(res$coef ))
res[1:6,]
table(res$p.value < 0.05)
res <- res[res$p.value <= 0.05, ]
res <- res[order(res$p.value), ]
single_gene <- rownames(res)
#single_gene即为得到的单因素cox分析中P小于0.05的基因
sig_genes <- survival_dat[ ,which(colnames(survival_dat) %in% single_gene)]
rownames(sig_genes) <- survival_dat$Barcode
save(sig_genes,file = 'sig_genes.Rdata')
#得到了有生存意义的mRNA表达矩阵
# =======================================================
####合并选定的lncRNA,以及mRNA表达矩阵####################
# =======================================================
library(dplyr)
library(tidyr)
setwd('D:\\test\\Five_key_lncRNAs\\chapter3')
rm(list=ls())
load( 'sig_genes.Rdata')
lncRNA <- read.table('diffmRNAExp.txt',header = T,sep = '\t')
#diffmRNAExp.txt文件是上一节生成的差异基因文件
univariate_data <- read.table('diffmRNAExp.txt',header = T,row.names = 1,
sep = '\t')
univariate_data <- log2(univariate_data +0.001)
metadata <- data.frame(colnames(univariate_data))
for (i in 1:length(metadata[,1])) {
num <- as.numeric(as.character(substring(metadata[i,1],14,15)))
if (num == 1 ) {metadata[i,2] <- "T"}
if (num != 1) {metadata[i,2] <- "N"}
}
names(metadata) <- c("id","group")
metadata$group <- as.factor(metadata$group)
metadata <- subset(metadata,metadata$group == "T")
metadata
exprSet <- univariate_data[,which(colnames(univariate_data) %in% metadata$id)]
colnames(exprSet) <- substr(x=colnames(exprSet),start = 1,stop = 12)
colnames(exprSet) <- chartr(old='.',new = '-',x=colnames(exprSet) )
survival <- read.csv('survival.csv',header = T,row.names = 1)
select_colname <- function(expr,name){
expr <- expr[,which(colnames(expr) %in% name[,1])]
expr <- expr[,order(names(expr))]
name <- name[which(name[,1] %in% colnames(expr)),]
name <- name[order(name[,1]),]
expr_name <- list( expr,name)
return( expr_name)
}
dat <- select_colname(exprSet,survival )
data <- as.data.frame(t(dat[[1]]))
data$Barcode <- row.names(data)
survival <- survival %>%
dplyr::select(Barcode,OS,OS.Time,OS.Time,Age,Grade,TNM)
survival_dat <- merge(survival,data,by='Barcode')
library(stringr)
survival_dat$Grade <- str_extract(survival_dat$Grade,pattern = '\\d')
survival_dat$TNM <- str_extract(survival_dat$TNM,pattern = 'T\\d')
survival_dat$TNM <- str_extract(survival_dat$TNM,pattern = '\\d')
getmode <- function(v){
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v,uniqv)))]
}
# cha <- c('w','w','w','e','e','r')
# num <- c(0,0,1,2,2,3,4,4,4,4,6,6)
#
# getmode(num)
# getmode(cha)
#
library(Hmisc)
library(mice)
md.pattern(survival_dat)
survival_dat$TNM <- impute(survival_dat$TNM,getmode)
survival_dat$Grade <- impute(survival_dat$Grade,getmode)
survival_dat <- survival_dat %>%
dplyr::select( Barcode,OS,OS.Time ,Age,
Grade, TNM , MIR202HG,
LINC02260,LINC01894 )
md.pattern(survival_dat)
sig_genes$Barcode <- rownames(sig_genes)
data_wgcna <- merge(survival_dat,sig_genes,by='Barcode')
save(data_wgcna,file = 'data_wgcna.Rdata')
#保存用于WGCNA分析的数据,行为样本名
#列为基因或者临床属性
# =======================================================
#########################################################
#########################################################
#作者:工程师2号 ########################################
#简书笔记博客(柳叶刀与小鼠标)##########################
# https://www.jianshu.com/u/619b87e54936 ###############
#########################################################
#########################################################
# =======================================================
网友评论