美文网首页
数据挖掘:是时候更新一下TCGA的数据了

数据挖掘:是时候更新一下TCGA的数据了

作者: 生信探索 | 来源:发表于2024-05-07 20:54 被阅读0次

    TCGA在去年更新之后提供了Count、TPM、FPKM三种格式的mRNA表达量数据,同时提供了Ensembl gene ID、基因名、基因类型,因此有必要更新一下数据了。

    安装需要的R包

    # CRAN packages used in this walkthrough, installed in a single call.
    # "remotes" is added because BiocManager::install() hands "owner/repo"
    # GitHub specs to remotes; without it the two GitHub installs below fail.
    install.packages(c(
      "tidyverse", "arrow", "data.table", "magrittr", "pacman", "remotes"
    ))

    # BiocManager is only installed when it is not already available.
    if (!requireNamespace("BiocManager", quietly = TRUE)) {
      install.packages("BiocManager")
    }

    # GitHub versions of TCGAbiolinks and its companion data package
    # (same install order as before).
    BiocManager::install("BioinformaticsFMRP/TCGAbiolinksGUI.data")
    BiocManager::install("BioinformaticsFMRP/TCGAbiolinks")

    TCGA数据版本信息

    # Attach the packages used below. The former `rm(list = ls())` was removed:
    # it deletes the reader's workspace objects without resetting loaded
    # packages or options, so it gives no real "clean session" guarantee.
    library(pacman)
    p_load(magrittr, tidyverse, TCGAbiolinks, data.table, arrow)

    # Print the GDC data-release information reported by the API.
    TCGAbiolinks::getGDCInfo()

    # $commit

    # [1] "4dd3680528a19ed33cfc83c7d049426c97bb903b"

    # $data_release

    # [1] "Data Release 36.0 - December 12, 2022"

    # $status

    # [1] "OK"

    # $tag

    # [1] "3.0.0"

    # $version

    # [1] 1

    建几个文件夹(下面这条命令在终端/shell中执行,而不是在R里)

    # Shell command (not R): create one output directory per data type.
    # -p keeps the command from failing when the directories already exist.
    mkdir -p mRNA miRNA SNV CNV Protein

    需要下载的数据

    # Cohort codes of all TCGA projects known to GDC (e.g. "BRCA"), obtained by
    # keeping the project ids that start with "TCGA-" and stripping that prefix.
    # Both patterns are anchored so only a leading "TCGA-" is matched/removed.
    gdc_projects <- TCGAbiolinks::getGDCprojects() %>%
      pull(id) %>%
      str_subset("^TCGA-") %>%
      str_remove("^TCGA-")

    gdc_projects

    # [1] "CHOL" "LIHC" "DLBC" "BLCA" "ACC"  "CESC" "PCPG" "PAAD" "MESO" "TGCT"

    # [11] "KIRP" "UVM"  "UCS"  "THYM" "COAD" "ESCA" "GBM"  "KICH" "HNSC" "PRAD"

    # [21] "OV"  "LUSC" "LAML" "LGG"  "SARC" "BRCA" "READ" "LUAD" "STAD" "THCA"

    # [31] "KIRC" "SKCM" "UCEC"

    下载mRNA表达量数据

    #' Download the STAR-Counts mRNA expression table for one TCGA cohort.
    #'
    #' Queries GDC for "Gene Expression Quantification" files of the project
    #' `TCGA-<cancer>`, downloads and prepares them as a data frame, keeps the
    #' gene annotation columns plus the unstranded count / TPM / FPKM columns,
    #' and writes the result to `mRNA/TCGA_<cancer>_mRNA.arrow`.
    #'
    #' @param cancer Cohort code without the "TCGA-" prefix, e.g. "BRCA".
    #' @return `NULL`, invisibly; the function is called for its side effects.
    downRNA <- function(cancer) {
      # `legacy` is no longer passed: FALSE was the default where the argument
      # existed, and omitting it keeps the call valid if GDCquery() lacks it.
      query <- TCGAbiolinks::GDCquery(
        project = paste0("TCGA-", cancer),
        data.category = "Transcriptome Profiling",
        data.type = "Gene Expression Quantification",
        workflow.type = "STAR - Counts"
      )
      TCGAbiolinks::GDCdownload(query, files.per.chunk = 50)
      data <- TCGAbiolinks::GDCprepare(query, summarizedExperiment = FALSE)

      dt <- data %>%
        # Keep only rows whose gene_id starts with "EN".
        dplyr::filter(str_detect(gene_id, "^EN")) %>%
        dplyr::select(
          gene_id, gene_name, gene_type,
          starts_with("unstranded"),
          starts_with("tpm"),
          starts_with("fpkm_unstranded")
        )

      # "tpm_unstranded_*" -> "tpm_*", "fpkm_unstranded_*" -> "fpkm_*",
      # then the remaining bare "unstranded_*" columns -> "count_*".
      colnames(dt) <- colnames(dt) %>%
        str_remove("_unstranded") %>%
        str_replace("unstranded", "count")

      # Fix: the compression options belong to write_ipc_file(); they were
      # previously swallowed by str_glue() and therefore never applied.
      arrow::write_ipc_file(
        dt,
        str_glue("mRNA/TCGA_{cancer}_mRNA.arrow"),
        compression = "zstd",
        compression_level = 1
      )

      invisible(NULL)
    }

    # Run the mRNA download for every cohort, one after another.
    gdc_projects %>% walk(downRNA)

    下载其他几种数据的函数

    #' Download one GDC data type for one TCGA cohort and save it as Arrow IPC.
    #'
    #' Builds a GDC query for project `TCGA-<cancer>`, downloads the matching
    #' files, prepares them as a data frame and writes the result to
    #' `<folder_name>/TCGA_<cancer>_<folder_name>.arrow` with zstd compression.
    #'
    #' @param cancer Cohort code without the "TCGA-" prefix, e.g. "BRCA".
    #' @param folder_name Output directory; also used as the file-name suffix.
    #' @param data_category,data_type,workflow_type,experimental_strategy
    #'   Forwarded to the matching `GDCquery()` arguments. The `FALSE` defaults
    #'   are forwarded as-is (presumably GDCquery() treats FALSE as "no filter"
    #'   -- confirm against the installed TCGAbiolinks version).
    #' @param legacy Forwarded to `GDCquery()` only when `TRUE`, so the default
    #'   call stays valid on TCGAbiolinks versions without that argument.
    #' @return The value of `arrow::write_ipc_file()`.
    download <- function(
        cancer,
        folder_name,
        data_category = FALSE,
        data_type = FALSE,
        workflow_type = FALSE,
        experimental_strategy = FALSE,
        legacy = FALSE) {
      query_args <- list(
        project = paste0("TCGA-", cancer),
        data.category = data_category,
        data.type = data_type,
        experimental.strategy = experimental_strategy,
        workflow.type = workflow_type
      )
      # Omitting `legacy` is equivalent to the old `legacy = FALSE` default;
      # an explicit TRUE is still passed so the request is never silently lost.
      if (isTRUE(legacy)) {
        query_args$legacy <- TRUE
      }
      query <- do.call(TCGAbiolinks::GDCquery, query_args)

      TCGAbiolinks::GDCdownload(query, files.per.chunk = 50)
      prepared <- TCGAbiolinks::GDCprepare(
        query = query,
        summarizedExperiment = FALSE
      )

      # Fix: the compression options belong to write_ipc_file(); they were
      # previously swallowed by str_glue() and therefore never applied.
      arrow::write_ipc_file(
        prepared,
        str_glue("{folder_name}/TCGA_{cancer}_{folder_name}.arrow"),
        compression = "zstd",
        compression_level = 1
      )
    }

    下载microRNA表达量数据

    # miRNA expression quantification for every cohort -> miRNA/
    walk(
      gdc_projects,
      download,
      folder_name = "miRNA",
      data_category = "Transcriptome Profiling",
      data_type = "miRNA Expression Quantification",
      experimental_strategy = "miRNA-Seq"
    )

    下载SNV数据

    # Masked somatic mutation data for every cohort -> SNV/
    walk(
      gdc_projects,
      download,
      folder_name = "SNV",
      data_category = "Simple Nucleotide Variation",
      data_type = "Masked Somatic Mutation"
    )

    下载CNV数据

    # Masked copy-number segments for every cohort -> CNV/
    walk(
      gdc_projects,
      download,
      folder_name = "CNV",
      data_category = "Copy Number Variation",
      data_type = "Masked Copy Number Segment"
    )

    下载蛋白表达量数据

    # Protein expression quantification for every cohort -> Protein/
    # Fix: the last string was closed with a typographic quote (U+201D), which
    # R cannot parse; it now ends with a plain ASCII double quote.
    walk(
      gdc_projects,
      download,
      folder_name = "Protein",
      data_category = "Proteome Profiling",
      data_type = "Protein Expression Quantification"
    )

    相关文章

      网友评论

          本文标题:数据挖掘:是时候更新一下TCGA的数据了

          本文链接:https://www.haomeiwen.com/subject/fgkdfjtx.html