美文网首页
听说你想要TCGA的新版miRNA表达数据

听说你想要TCGA的新版miRNA表达数据

作者: 小洁忘了怎么分身 | 来源:发表于2023-03-09 13:43 被阅读0次

TCGA的下载方式更新了,miRNA也跟着动一下。

1.查看TCGA的33个癌种

rm(list = ls())
library(TCGAbiolinks)
library(stringr)
library(SummarizedExperiment)
projs <- getGDCprojects()$project_id %>% 
  str_subset("TCGA")
projs
## [1] "TCGA-CHOL" "TCGA-LIHC" "TCGA-DLBC" "TCGA-BLCA" "TCGA-ACC" 
## [6] "TCGA-CESC" "TCGA-PCPG" "TCGA-PAAD" "TCGA-MESO" "TCGA-TGCT"
## [11] "TCGA-KIRP" "TCGA-UVM"  "TCGA-UCS"  "TCGA-THYM" "TCGA-COAD"
## [16] "TCGA-ESCA" "TCGA-GBM"  "TCGA-KICH" "TCGA-HNSC" "TCGA-PRAD"
## [21] "TCGA-OV"   "TCGA-LUSC" "TCGA-LAML" "TCGA-LGG"  "TCGA-SARC"
## [26] "TCGA-BRCA" "TCGA-READ" "TCGA-LUAD" "TCGA-STAD" "TCGA-THCA"
## [31] "TCGA-KIRC" "TCGA-SKCM" "TCGA-UCEC"

2.下载数据

使用的是TCGAbiolinks里面的函数,受网络限制,有可能下载不成功。

project = "TCGA-ESCA"
if(F){
  query.isoform <- GDCquery(
    project = project, 
    experimental.strategy = "miRNA-Seq",
    data.category = "Transcriptome Profiling", 
    data.type = "Isoform Expression Quantification"
  )
  GDCdownload(query.isoform)
  
  isoform <- GDCprepare(
    query = query.isoform,
    save = TRUE, 
    save.filename = paste0(project,"_isoform.Rdata")
  )  
}

3.整理数据

rm(list = ls())
load("TCGA-ESCA_isoform.Rdata")
library(tibble)
library(dplyr)
library(tidyr)
head(data)

## # A tibble: 6 × 7
##   miRNA_ID     isoform_coords read_…¹ reads…² cross…³ miRNA…⁴ barcode
##   <chr>        <chr>            <int>   <dbl> <chr>   <chr>   <chr>  
## 1 hsa-let-7a-1 hg38:chr9:941…       1   0.316 N       precur… TCGA-V…
## 2 hsa-let-7a-1 hg38:chr9:941…       1   0.316 N       precur… TCGA-V…
## 3 hsa-let-7a-1 hg38:chr9:941…       1   0.316 N       mature… TCGA-V…
## 4 hsa-let-7a-1 hg38:chr9:941…       2   0.631 N       mature… TCGA-V…
## 5 hsa-let-7a-1 hg38:chr9:941…       5   1.58  N       mature… TCGA-V…
## 6 hsa-let-7a-1 hg38:chr9:941…      33  10.4   N       mature… TCGA-V…
## # … with abbreviated variable names ¹read_count,
## #   ²reads_per_million_miRNA_mapped, ³`cross-mapped`, ⁴miRNA_region

colnames(data)

## [1] "miRNA_ID"                       "isoform_coords"                
## [3] "read_count"                     "reads_per_million_miRNA_mapped"
## [5] "cross-mapped"                   "miRNA_region"                  
## [7] "barcode"

exp = data %>%
  dplyr::select(c(miRNA_region,read_count,barcode)) %>% 
  group_by(barcode,miRNA_region) %>% 
  summarise(read_count = sum(read_count)) %>% 
  pivot_wider(names_from = barcode,
              values_from = read_count) %>% 
  column_to_rownames("miRNA_region") %>% 
  as.matrix()
exp[is.na(exp)] = 0
exp[1:4,1:4]

##                     TCGA-2H-A9GF-01A-11R-A37J-13
## mature,MIMAT0000062                       176657
## mature,MIMAT0000063                        25442
## mature,MIMAT0000064                         6623
## mature,MIMAT0000065                          813
##                     TCGA-2H-A9GG-01A-11R-A37J-13
## mature,MIMAT0000062                       127124
## mature,MIMAT0000063                        33978
## mature,MIMAT0000064                         2488
## mature,MIMAT0000065                          808
##                     TCGA-2H-A9GH-01A-11R-A37J-13
## mature,MIMAT0000062                       124680
## mature,MIMAT0000063                        18955
## mature,MIMAT0000064                         3896
## mature,MIMAT0000065                          662
##                     TCGA-2H-A9GI-01A-11R-A37J-13
## mature,MIMAT0000062                       212221
## mature,MIMAT0000063                        25107
## mature,MIMAT0000064                         1412
## mature,MIMAT0000065                         1874

##### 简化行名,去掉前缀
k = str_detect(rownames(exp),"mature,");table(k)

## k
## FALSE  TRUE 
##     3  2082

rownames(exp)[!k]

## [1] "precursor"   "stemloop"    "unannotated"

exp = exp[k,]
rownames(exp) = str_remove(rownames(exp),"mature,")
##### 找到行名的对应关系
library(miRBaseVersions.db)
mh <- select(miRBaseVersions.db,
               keys = rownames(exp),
               keytype = "MIMAT",
               columns = c("ACCESSION","NAME","VERSION"))
head(mh)

##      ACCESSION          NAME VERSION
## 1 MIMAT0000062 hsa-let-7a-5p      22
## 2 MIMAT0000063 hsa-let-7b-5p      22
## 3 MIMAT0000064 hsa-let-7c-5p      22
## 4 MIMAT0000065 hsa-let-7d-5p      22
## 5 MIMAT0000066 hsa-let-7e-5p      22
## 6 MIMAT0000067 hsa-let-7f-5p      22

mh = mh[mh$VERSION=="21",]
table(rownames(exp) %in% mh$ACCESSION)

## 
## TRUE 
## 2082
# 转换行名
library(tinyarray)
exp = trans_array(exp,mh,from = "ACCESSION",to = "NAME")
exp[1:4,1:4]

##               TCGA-2H-A9GF-01A-11R-A37J-13
## hsa-let-7a-5p                       176657
## hsa-let-7b-5p                        25442
## hsa-let-7c-5p                         6623
## hsa-let-7d-5p                          813
##               TCGA-2H-A9GG-01A-11R-A37J-13
## hsa-let-7a-5p                       127124
## hsa-let-7b-5p                        33978
## hsa-let-7c-5p                         2488
## hsa-let-7d-5p                          808
##               TCGA-2H-A9GH-01A-11R-A37J-13
## hsa-let-7a-5p                       124680
## hsa-let-7b-5p                        18955
## hsa-let-7c-5p                         3896
## hsa-let-7d-5p                          662
##               TCGA-2H-A9GI-01A-11R-A37J-13
## hsa-let-7a-5p                       212221
## hsa-let-7b-5p                        25107
## hsa-let-7c-5p                         1412
## hsa-let-7d-5p                         1874

这样就整理完成咯。

相关文章

网友评论

      本文标题:听说你想要TCGA的新版miRNA表达数据

      本文链接:https://www.haomeiwen.com/subject/ihwxldtx.html