美文网首页TCGA数据挖掘生信医学猴哥
用MCP法计算TCGA样本中的免疫浸润

用MCP法计算TCGA样本中的免疫浸润

作者: 碌碌无为的杰少 | 来源:发表于2020-11-16 01:31 被阅读0次

    下载TCGA FPKM数据

    rm(list = ls())
    
    # 安装加载需要的R包
    # install.packages("pacman", repos = 'https://mirror.lzu.edu.cn/CRAN/')
    library(pacman)
    p_load(TCGAbiolinks, tidyverse, magrittr, data.table, biomaRt)
    library(dplyr)
    # 设置工作目录
    projectPath = "C:/Users/81494/Desktop/changai fpkm"     # 替换为真实的工作目录
    dataPath = paste(projectPath, "Data", sep = "/")
    if(!dir.exists(dataPath)) dir.create(dataPath)  # 新建一个Data文件夹,用于存放从TCGA上下载的数据
    setwd(dataPath)
    library(dplyr)
    
    #### 使用TCGAbiolinks下载Counts和FPKM-UQ数据 ----
    #https://www.bioconductor.org/packages/release/bioc/html/TCGAbiolinks.html
    for(data_type in c("HTSeq - Counts", "HTSeq - FPKM")){
        
        repeat{
            query = try(GDCquery(project = "TCGA-COAD", legacy = FALSE, experimental.strategy = "RNA-Seq", data.category = "Transcriptome Profiling", data.type = "Gene Expression Quantification", workflow.type = data_type), silent = TRUE)
            if(class(query) != "try-error") {
                break
            }else{
                print("Try to connect to GDC...")
            }
        }
        GDCdownload(query, files.per.chunk = 100)   # 分批下载
    
        # 数据预处理
        dataAssy = GDCprepare(query, summarizedExperiment = F)
        
        # # 保存变量,以备后用
        # filename = paste0("TCGA-LIHC.RNA.query.", str_match(data_type, "- ([^\\s]*)$")[,2], ".RData")
        # save(query, dataAssy, file = filename)
        # load(filename)
        
        # 数据整理
        exp_data = dataAssy %>% 
            filter(str_detect(X1, "^ENSG")) %>% # 提取ENSG*注释的基因
            mutate(X1 = gsub("\\..*", "", X1)) %>% # 删除版本号
            rename(ensembl_gene_id = X1) %>% # "X1"重命名为"ensembl_gene_id"
            set_colnames(str_replace(colnames(.), "(-\\w*){3}$", "")) # 修改TCGA样本名
            # 上两行等同于:
            # set_colnames(c("ensembl_gene_id", str_match(colnames(.)[-1], "(TCGA-[^-]*-[^-]*-[^-]*)")[,2])) # 修改列名
        exp_data[1:3,1:3]
        
       
        
        # 输出
        outfile = paste0("COAD_Portal_RNA_", str_match(data_type, "- ([^\\s]*)$")[,2], ".txt")
        fwrite(exp_data, outfile, row.names = F, sep = "\t", quote = F)
        
    

    下载TCGA临床数据

    XENA官网下载整合好的数据

    p_load(data.table, tidyverse, magrittr)
    clinic_data = read.table("COAD_clinicalMatrix", header = T, sep = "\t")
    surv_data = read.table("COAD_survival.txt", header = T, sep = "\t")
    clinic_data = merge(clinic_data, surv_data, by.x = "sampleID", by.y = "sample")
    rownames(clinic_data) <- clinic_data$sampleID
    clinic_data=clinic_data[,c('sampleID','CDE_ID_3226963','MSI_updated_Oct62011','loss_expression_of_mismatch_repair_proteins_by_ihc','loss_expression_of_mismatch_repair_proteins_by_ihc_result','microsatellite_instability','pathologic_M','pathologic_N','pathologic_T','pathologic_stage','OS','OS.time','DSS','DSS.time','PFI','PFI.time')]
    

    整理表达矩阵

    exp_FPKM = fread("COAD_Portal_RNA_FPKM.txt", h = T, sep = "\t", check.names = F) %>% column_to_rownames("ensembl_gene_id")
    table(!duplicated(colnames(exp_FPKM)))
    exp_FPKM = exp_FPKM[,!duplicated(colnames(exp_FPKM))]#去重复测序两次的标本
    exp_FPKM_T=exp_FPKM[,str_sub(colnames(exp_FPKM),14,16)=="01A"]#肿瘤样本
    colnames(exp_FPKM_T) <- str_sub(colnames(exp_FPKM_T),1,15)
    dd1 <- exp_FPKM_T[1:20,1:20]
    

    提取mRNA和gene名转换

    load("anno.Rdata")
    #step4:表达矩阵拆分和注释
    mRNA_anno <- mRNA_anno %>% 
        tidyr::separate(gene_id,into = c("gene_id","drop"),sep="\\.")
    
    sum(rownames(exp_FPKM_T) %in% mRNA_anno$gene_id)
    mRNA_exp = exp_FPKM_T[rownames(exp_FPKM_T) %in% mRNA_anno$gene_id,]
    tmp = data.frame(gene_id = rownames(exp_FPKM_T))
    x = dplyr::inner_join(tmp,mRNA_anno,by = "gene_id")
    #inner_join不改变顺序,可以确认一下
    table(!duplicated(x$gene_name))
    #行名不允许重复,因此一个ensambelid对应多个symbol的需要去掉。
    mRNA_exp = mRNA_exp[!duplicated(x$gene_name),]
    x = x[!duplicated(x$gene_name),]
    rownames(mRNA_exp) = x$gene_name
    mRNA_exp = na.omit(mRNA_exp)
    

    表达矩阵和临床数据融合

    table(colnames(mRNA_exp) %in% clinic_data$sample)
    
    index <- intersect(colnames(mRNA_exp),clinic_data$sample)
    
    mRNA_exp1 = mRNA_exp[,index]
    
    clinic_data1 <- clinic_data[index,]
    
    write.csv(clinic_data1,'clinic_data2.csv') 
    

    手工整理临床数据

    在csv中剔除生存时间小于90天和NA的,和存活时间大于3000天的病例
    

    表达矩阵和临床数据再融合

    clinic_data2 <- read.csv('clinic_data2.csv',header = T)
    rownames(clinic_data2) <- clinic_data2$sampleID
    table(colnames(mRNA_exp) %in% clinic_data2$sampleID)
    index1 <- intersect(colnames(mRNA_exp),clinic_data2$sampleID)
    mRNA_exp2 = mRNA_exp[,index1]
    clinic_data2 <- clinic_data2[index1,]
    

    MCP法

    install.packages("MCPcounter-master/Source/",repos=NULL,type="source")
    library(MCPcounter)
    genes <- data.table::fread("genes.txt",data.table = F)
    results<- MCPcounter.estimate(mRNA_exp2,
                                  featuresType= "HUGO_symbols",
                                  probesets=probesets,
                                  genes=genes)
    results2 <- results[1:9,]#去除最后一行fbro细胞的行
    library(pheatmap)
    pheatmap(results2, #热图的数据
             cluster_rows = F,#行聚类
             cluster_cols = T,#列聚类,可以看出样本之间的区分度
             ##annotation_col =annotation_col, #标注样本分类
             annotation_legend=TRUE, # 显示注释
             show_rownames = T,
             show_colnames = F,# 显示行名
             scale = "row", #以行来标准化
             color =colorRampPalette(c("blue", "white","red"))(100),#调色
             #filename = "heatmap_F.pdf",#是否保存
    
    图片.png

    生存分析

    results1 <- as.data.frame(t(results))
    Last <- cbind(results1,clinic_data2)
    Last$group = ifelse(Last$`B lineage` > median(Last$`B lineage`),'high','low')
    library(survival)
    library(survminer)
    sfit1=survfit(Surv(OS.time, OS)~group, data=Last)
    ggsurvplot(sfit1,pval =TRUE, data = Last,risk.table = TRUE)
    sfit1=survfit(Surv(DSS.time, DSS)~group, data=Last)
    ggsurvplot(sfit1,pval =TRUE, data = Last, risk.table = TRUE)
    #surv.cut = surv_cutpoint(Last, time = "OS.time", event = "OS", #variables = "B lineage", minprop = 0.3)
    # 最优分类值
    #opticut = surv.cut$cutpoint$cutpoint 
    #opticut
    
    图片.png

    相关文章

      网友评论

        本文标题:用MCP法计算TCGA样本中的免疫浸润

        本文链接:https://www.haomeiwen.com/subject/cyuamktx.html