重复一篇3分左右纯生信文章(第三部分)

作者: 柳叶刀与小鼠标 | 来源:发表于2019-07-21 23:51 被阅读177次

    本文目的:一文解决WGCNA分析问题。

    原文章使用了自己识别的五个lncRNA,与mRNA合并做WGCNA分析,目的是为了得到lncRNA相关的mRNA。所以这里,我们做WGCNA,所需要的数据可以推测其包括:lncRNA表达量,mRNA表达矩阵,一些临床参数数据。

    代码WGCNA_prepare.R(给WGCNA分析做前期数据准备)

    # =======================================================
    #########################################################
    #########################################################
    #作者:工程师2号 ########################################
    #简书笔记博客(柳叶刀与小鼠标)##########################
    # https://www.jianshu.com/u/619b87e54936 ###############
    #########################################################
    #########################################################
    # =======================================================
    
    
    
    
    # =======================================================
    #### 下载表达数据################################
    # =======================================================
    
    
    library(TCGAbiolinks)
    library(SummarizedExperiment)
    library(dplyr)
    library(DT)
    library(tibble)
    rm(list=ls())
    
    setwd('D:\\test\\Five_key_lncRNAs\\chapter1\\download\\expression')
    
    exp <- GDCquery(project = "TCGA-PAAD", 
                    data.category = "Transcriptome Profiling", 
                    data.type = "Gene Expression Quantification", 
                    workflow.type = "HTSeq - FPKM")
    
    GDCdownload(exp)
    
    expdat <- GDCprepare(query =exp)
    
    count_matrix= as.data.frame(assay(expdat))
    
    
    
    
    setwd("D:\\Originaldata\\GRCH\\Homo_sapiens.GRCh38.90")
    
    load("gtf_df.Rda")
    
    test <- gtf_df[1:5,]
    
    View(test)
    
    
    
    fpkmToTpm <- function(fpkm)
    {
      exp(log(fpkm) - log(sum(fpkm)) + log(1e6))
    }
    
    
    expr <- as.data.frame (apply(count_matrix  , 2, fpkmToTpm))
    
    
    expr <-  expr %>% rownames_to_column("gene_id")
    
    
    ###表达矩阵中提取mRNA表达矩阵
    
    mRNA_exprSet <- gtf_df %>%
      dplyr::filter(type=="gene",gene_biotype=="protein_coding") %>%
      dplyr::select(c(gene_name,gene_id,gene_biotype)) %>%
      dplyr::inner_join(expr,by ="gene_id") %>%
      tidyr::unite(gene_id,gene_name,gene_id,gene_biotype,sep = " | ")
    
    
    save(mRNA_exprSet,file = "mRNA_exprSet.Rda")
    
    
    # =======================================================
    #### 提取mRNA表达矩阵################################
    # =======================================================
    
    mRNA_exprSet <- mRNA_exprSet %>%
      tidyr::separate(gene_id, c("gene_name","gene_id","gene_biotype"),
                      sep = " \\| ")
    
    
    mRNA_exprSet <- mRNA_exprSet[,-(2:3)]
    
    
    index <- duplicated(mRNA_exprSet$gene_name)
    mRNA_exprSet <- mRNA_exprSet[!index,]
    row.names(mRNA_exprSet) <- mRNA_exprSet$gene_name
    mRNA_exprSet$gene_name <- NULL
    
    
    ###第五节:删除癌旁样本和二次测序的样本
    metadata <- data.frame(colnames(mRNA_exprSet))
    
    
    for (i in 1:length(metadata[,1])) {
      num <- as.numeric(as.character(substring(metadata[i,1],14,15)))
      if (num == 1 ) {metadata[i,2] <- "T"}
      if (num != 1) {metadata[i,2] <- "N"}
    }
    names(metadata) <- c("id","group")
    metadata$group <- as.factor(metadata$group)
    
    
    metadata <- subset(metadata,metadata$group == "T")
    metadata
    
    mRNA_exprSet1 <- mRNA_exprSet[,which(colnames(mRNA_exprSet) %in% metadata$id)]
    
    metadata <- data.frame(colnames(mRNA_exprSet1))
    for (i in 1:length(metadata[,1])) {
      chr <-  as.character(substring(metadata[i,1],22,25))
      if ( chr == 'A277' ) {metadata[i,2] <- "Rep"}
      if ( chr!= 'A277' ) {metadata[i,2] <- "T"}
    }
    
    names(metadata) <- c("id","group")
    metadata$group <- as.factor(metadata$group)
    metadata <- subset(metadata,metadata$group == "T")
    metadata
    
    
    ###保存mRNA表达矩阵
    
    mRNA_exprSet2 <- mRNA_exprSet1[,which(colnames(mRNA_exprSet1) %in% metadata$id)]
    
    setwd('D:\\test\\Five_key_lncRNAs\\chapter3')
    
    save(mRNA_exprSet2,file = 'mRNA_exprSet2.Rdata')
    
    
    # =======================================================
    ####提取lncRNA表达矩阵###########################
    # ======================================================
    
    
    setwd('D:\\test\\Five_key_lncRNAs\\chapter3')
    library(survival)
    library(future.apply)
    plan(multiprocess)
    
    library(TCGAbiolinks)
    library(dplyr)
    library(DT)
    library(tibble)
    
    rm(list=ls())
    
    survival_data <- read.csv('survival.csv',
                              header = T,row.names = 1)  %>%
      dplyr::select( Barcode,OS  ,OS.Time)
    
    
    
    load('mRNA_exprSet2.Rdata')
    
    
    mRNA_exprSet <- mRNA_exprSet2
    
    colnames(mRNA_exprSet) <- substr(x=colnames(mRNA_exprSet),
                                     start = 1,
                                     stop = 12)
    
    mRNA_exprSet <- as.data.frame(t(mRNA_exprSet))
    
    mRNA_exprSet  <- mRNA_exprSet +0.001
    
    a <- mRNA_exprSet[1:6,1:6]
    
    
    
    myfun_cv <-function(x){
      cv<-  sd(x)/mean(x)  
      return(cv) 
    }
    
    
    #挑选变异系数大于0.3的mRNA基因
    cv_gene <-  as.data.frame(apply(mRNA_exprSet, 2, myfun_cv))
    
    colnames(cv_gene) <- 'cv'
    
    cv_gene <- subset(cv_gene,cv_gene$cv > 0.3)
    
    mRNA_exprSet <- mRNA_exprSet[,which(colnames(mRNA_exprSet) %in% rownames(cv_gene))]
    
    mRNA_exprSet$Barcode <- rownames(mRNA_exprSet)
    
    survival_dat <- merge(survival_data, mRNA_exprSet, by='Barcode')
    
    
    ####mRNA单因素分析,因为没必要用所有的mRNA来做WGCNA
    #仅仅需要有生存意义的mRNA和我们选定的三个lncRNA即可
    
    colnames(survival_dat) <- sub("\\-", "", colnames(survival_dat))
    
    colnames(survival_dat) <- sub("\\.", "_", colnames(survival_dat))
    
    colnames(survival_dat) <- sub("\\:", "_", colnames(survival_dat))
    
    covariates <- as.character(colnames(survival_dat))[-(1:3)]
    
    univ_formulas <- sapply(covariates,
                            function(x){
                              as.formula(paste('Surv(OS_Time, OS)~', x))})
    
    univ_models <- future_lapply( univ_formulas,
                                  function(x){coxph(x, data = survival_dat)})
    
    
    univ_results <-  lapply(univ_models,
                            function(x){ 
                              x <- summary(x)
                              p.value <- signif(x$wald["pvalue"], digits = 2)
                              beta <- signif(x$coef[1], digits = 2)
                              HR <- signif(x$coef[2], digits = 2)
                              HR.confint.lower <- signif(x$conf.int[,"lower .95"], 2)
                              HR.confint.upper <- signif(x$conf.int[,"upper .95"], 2)
                              HR <- paste0(HR, " (", 
                                           HR.confint.lower, "-", HR.confint.upper, ")")
                              res <- c(beta, HR, p.value)
                              names(res) <- c("coef", "HR (95% CI for HR)", "p.value")
                              return(res)
                            })
    
    
    res <- as.data.frame(t(do.call(cbind, univ_results)))
    
    res[1:6,]
    
    res$p.value <- as.numeric(as.character(res$p.value))
    
    res$coef <- as.numeric(as.character(res$coef ))
    
    res[1:6,]
    
    table(res$p.value < 0.05)
    
    res  <- res[res$p.value <= 0.05, ]
    
    res  <- res[order(res$p.value), ]
    
    single_gene <- rownames(res)
    #single_gene即为得到的单因素cox分析中P小于0.05的基因
    
    sig_genes <- survival_dat[ ,which(colnames(survival_dat) %in% single_gene)]
    
    rownames(sig_genes) <- survival_dat$Barcode
    
    save(sig_genes,file = 'sig_genes.Rdata')
    #得到了有生存意义的mRNA表达矩阵
    
    
    # =======================================================
    ####合并选定的lncRNA,以及mRNA表达矩阵####################
    # =======================================================
    
    library(dplyr)
    
    library(tidyr)
    
    setwd('D:\\test\\Five_key_lncRNAs\\chapter3')
    
    rm(list=ls())
    
    load( 'sig_genes.Rdata')
    
    lncRNA <- read.table('diffmRNAExp.txt',header = T,sep = '\t')
    
    
    #diffmRNAExp.txt文件是上一节生成的差异基因文件
    univariate_data <- read.table('diffmRNAExp.txt',header = T,row.names = 1,
                                  sep = '\t')
    
    univariate_data  <- log2(univariate_data +0.001)
    
    metadata <- data.frame(colnames(univariate_data))
    
    
    
    
    
    for (i in 1:length(metadata[,1])) {
      num <- as.numeric(as.character(substring(metadata[i,1],14,15)))
      if (num == 1 ) {metadata[i,2] <- "T"}
      if (num != 1) {metadata[i,2] <- "N"}
    }
    names(metadata) <- c("id","group")
    metadata$group <- as.factor(metadata$group)
    
    
    metadata <- subset(metadata,metadata$group == "T")
    metadata
    
    exprSet <- univariate_data[,which(colnames(univariate_data) %in% metadata$id)]
    
    colnames(exprSet)  <- substr(x=colnames(exprSet),start = 1,stop = 12)
    colnames(exprSet)  <- chartr(old='.',new = '-',x=colnames(exprSet) )
    
    survival <- read.csv('survival.csv',header = T,row.names = 1)
    
    
    
    
    select_colname <- function(expr,name){
      expr <- expr[,which(colnames(expr) %in% name[,1])]
      expr  <- expr[,order(names(expr))]   
      name <- name[which(name[,1] %in% colnames(expr)),]
      name <- name[order(name[,1]),]
      expr_name <- list( expr,name)
      return( expr_name)
    }
    
    dat <- select_colname(exprSet,survival )
    
    
    data <- as.data.frame(t(dat[[1]]))
    
    
    data$Barcode <- row.names(data)
    survival <- survival %>%
      dplyr::select(Barcode,OS,OS.Time,OS.Time,Age,Grade,TNM)
    
    
    survival_dat  <- merge(survival,data,by='Barcode')
    
    library(stringr)
    
    survival_dat$Grade <- str_extract(survival_dat$Grade,pattern = '\\d')
    survival_dat$TNM <- str_extract(survival_dat$TNM,pattern = 'T\\d')
    survival_dat$TNM <- str_extract(survival_dat$TNM,pattern = '\\d')
    
    
    getmode <- function(v){
      uniqv <- unique(v)
      uniqv[which.max(tabulate(match(v,uniqv)))]
    }
    
    # cha <- c('w','w','w','e','e','r')
    # num <- c(0,0,1,2,2,3,4,4,4,4,6,6)
    # 
    # getmode(num)
    # getmode(cha)
    # 
    
    library(Hmisc)
    library(mice)
    md.pattern(survival_dat)
    
    
    survival_dat$TNM <- impute(survival_dat$TNM,getmode)
    survival_dat$Grade <- impute(survival_dat$Grade,getmode)
    
    
    survival_dat <- survival_dat %>%
      dplyr::select( Barcode,OS,OS.Time ,Age,
                     Grade, TNM , MIR202HG,
                     LINC02260,LINC01894 )
    
    
    md.pattern(survival_dat)
    
    sig_genes$Barcode <- rownames(sig_genes)
    
    data_wgcna <- merge(survival_dat,sig_genes,by='Barcode')
    
    save(data_wgcna,file = 'data_wgcna.Rdata')
    
    #保存用于WGCNA分析的数据,行为样本名
    #列为基因或者临床属性
    # =======================================================
    #########################################################
    #########################################################
    #作者:工程师2号 ########################################
    #简书笔记博客(柳叶刀与小鼠标)##########################
    # https://www.jianshu.com/u/619b87e54936 ###############
    #########################################################
    #########################################################
    # =======================================================
    
    
    

    相关文章

      网友评论

        本文标题:重复一篇3分左右纯生信文章(第三部分)

        本文链接:https://www.haomeiwen.com/subject/mccblctx.html