TCGA的下载方式更新了,miRNA也跟着动一下。
1.查看TCGA的33个癌种
rm(list = ls())
library(TCGAbiolinks)
library(stringr)
library(SummarizedExperiment)
projs <- getGDCprojects()$project_id %>%
str_subset("TCGA")
projs
## [1] "TCGA-CHOL" "TCGA-LIHC" "TCGA-DLBC" "TCGA-BLCA" "TCGA-ACC"
## [6] "TCGA-CESC" "TCGA-PCPG" "TCGA-PAAD" "TCGA-MESO" "TCGA-TGCT"
## [11] "TCGA-KIRP" "TCGA-UVM" "TCGA-UCS" "TCGA-THYM" "TCGA-COAD"
## [16] "TCGA-ESCA" "TCGA-GBM" "TCGA-KICH" "TCGA-HNSC" "TCGA-PRAD"
## [21] "TCGA-OV" "TCGA-LUSC" "TCGA-LAML" "TCGA-LGG" "TCGA-SARC"
## [26] "TCGA-BRCA" "TCGA-READ" "TCGA-LUAD" "TCGA-STAD" "TCGA-THCA"
## [31] "TCGA-KIRC" "TCGA-SKCM" "TCGA-UCEC"
2.下载数据
使用的是TCGAbiolinks里面的函数,受网络限制,有可能下载不成功。
project = "TCGA-ESCA"
if(F){
query.isoform <- GDCquery(
project = project,
experimental.strategy = "miRNA-Seq",
data.category = "Transcriptome Profiling",
data.type = "Isoform Expression Quantification"
)
GDCdownload(query.isoform)
isoform <- GDCprepare(
query = query.isoform,
save = TRUE,
save.filename = paste0(project,"_isoform.Rdata")
)
}
3.整理数据
rm(list = ls())
load("TCGA-ESCA_isoform.Rdata")
library(tibble)
library(dplyr)
library(tidyr)
head(data)
## # A tibble: 6 × 7
## miRNA_ID isoform_coords read_…¹ reads…² cross…³ miRNA…⁴ barcode
## <chr> <chr> <int> <dbl> <chr> <chr> <chr>
## 1 hsa-let-7a-1 hg38:chr9:941… 1 0.316 N precur… TCGA-V…
## 2 hsa-let-7a-1 hg38:chr9:941… 1 0.316 N precur… TCGA-V…
## 3 hsa-let-7a-1 hg38:chr9:941… 1 0.316 N mature… TCGA-V…
## 4 hsa-let-7a-1 hg38:chr9:941… 2 0.631 N mature… TCGA-V…
## 5 hsa-let-7a-1 hg38:chr9:941… 5 1.58 N mature… TCGA-V…
## 6 hsa-let-7a-1 hg38:chr9:941… 33 10.4 N mature… TCGA-V…
## # … with abbreviated variable names ¹read_count,
## # ²reads_per_million_miRNA_mapped, ³`cross-mapped`, ⁴miRNA_region
colnames(data)
## [1] "miRNA_ID" "isoform_coords"
## [3] "read_count" "reads_per_million_miRNA_mapped"
## [5] "cross-mapped" "miRNA_region"
## [7] "barcode"
exp = data %>%
dplyr::select(c(miRNA_region,read_count,barcode)) %>%
group_by(barcode,miRNA_region) %>%
summarise(read_count = sum(read_count)) %>%
pivot_wider(names_from = barcode,
values_from = read_count) %>%
column_to_rownames("miRNA_region") %>%
as.matrix()
exp[is.na(exp)] = 0
exp[1:4,1:4]
## TCGA-2H-A9GF-01A-11R-A37J-13
## mature,MIMAT0000062 176657
## mature,MIMAT0000063 25442
## mature,MIMAT0000064 6623
## mature,MIMAT0000065 813
## TCGA-2H-A9GG-01A-11R-A37J-13
## mature,MIMAT0000062 127124
## mature,MIMAT0000063 33978
## mature,MIMAT0000064 2488
## mature,MIMAT0000065 808
## TCGA-2H-A9GH-01A-11R-A37J-13
## mature,MIMAT0000062 124680
## mature,MIMAT0000063 18955
## mature,MIMAT0000064 3896
## mature,MIMAT0000065 662
## TCGA-2H-A9GI-01A-11R-A37J-13
## mature,MIMAT0000062 212221
## mature,MIMAT0000063 25107
## mature,MIMAT0000064 1412
## mature,MIMAT0000065 1874
##### 简化行名,去掉前缀
k = str_detect(rownames(exp),"mature,");table(k)
## k
## FALSE TRUE
## 3 2082
rownames(exp)[!k]
## [1] "precursor" "stemloop" "unannotated"
exp = exp[k,]
rownames(exp) = str_remove(rownames(exp),"mature,")
##### 找到行名的对应关系
library(miRBaseVersions.db)
mh <- select(miRBaseVersions.db,
keys = rownames(exp),
keytype = "MIMAT",
columns = c("ACCESSION","NAME","VERSION"))
head(mh)
## ACCESSION NAME VERSION
## 1 MIMAT0000062 hsa-let-7a-5p 22
## 2 MIMAT0000063 hsa-let-7b-5p 22
## 3 MIMAT0000064 hsa-let-7c-5p 22
## 4 MIMAT0000065 hsa-let-7d-5p 22
## 5 MIMAT0000066 hsa-let-7e-5p 22
## 6 MIMAT0000067 hsa-let-7f-5p 22
mh = mh[mh$VERSION=="21",]
table(rownames(exp) %in% mh$ACCESSION)
##
## TRUE
## 2082
# 转换行名
library(tinyarray)
exp = trans_array(exp,mh,from = "ACCESSION",to = "NAME")
exp[1:4,1:4]
## TCGA-2H-A9GF-01A-11R-A37J-13
## hsa-let-7a-5p 176657
## hsa-let-7b-5p 25442
## hsa-let-7c-5p 6623
## hsa-let-7d-5p 813
## TCGA-2H-A9GG-01A-11R-A37J-13
## hsa-let-7a-5p 127124
## hsa-let-7b-5p 33978
## hsa-let-7c-5p 2488
## hsa-let-7d-5p 808
## TCGA-2H-A9GH-01A-11R-A37J-13
## hsa-let-7a-5p 124680
## hsa-let-7b-5p 18955
## hsa-let-7c-5p 3896
## hsa-let-7d-5p 662
## TCGA-2H-A9GI-01A-11R-A37J-13
## hsa-let-7a-5p 212221
## hsa-let-7b-5p 25107
## hsa-let-7c-5p 1412
## hsa-let-7d-5p 1874
这样就整理完成咯。
网友评论