本文使用的文件和脚本,我已经打包好。请在生信星球公众号后台回复“code658”获取。 mircode也是常用的查询lncRNA和miRNA关系的数据库。我下载了它的互作关系数据,放在R语言整理一下方便批量查询。
1.读取mircode的lncRNA数据
rm(list = ls())
options(stringsAsFactors = F)
mircode = read.delim("mircode_highconsfamilies.txt.gz",stringsAsFactors = F);dim(mircode)
## [1] 1329071 12
2.将lncRNA的数据挑选出来
colnames(mircode)
## [1] "gene_id" "gene_symbol" "gene_class"
## [4] "microrna" "seed_pos" "seed_type"
## [7] "repeat." "total_cons_." "primates_cons_."
## [10] "mammals_cons_." "vertebrates_cons_." "tr_region"
table(mircode$gene_class)
##
## coding lncRNA, intergenic lncRNA, overlapping other
## 1018974 74646 64237 40266
## pseudogene
## 130948
可以看到,gene_class里有两类属于lncRNA,区分了intergenic和overlapping,将他们对应的行单独挑选出来。
library(dplyr)
lncmiRcode = mircode[mircode$gene_class %in%c("lncRNA, intergenic","lncRNA, overlapping"),1:4];dim(lncmiRcode)
## [1] 138883 4
3.拆分mir-lncRNA关系对
看一下它的mircorna列,有一些合并在一起了:
head(lncmiRcode$microrna)
## [1] "miR-503"
## [2] "miR-145"
## [3] "miR-15abc/16/16abc/195/322/424/497/1907"
## [4] "miR-103a/107/107ab"
## [5] "miR-383"
## [6] "miR-128/128ab"
稍微了解一下mirna的命名规则,它并不都是以miR开头的:
library(stringr)
p1 = str_starts(lncmiRcode$microrna,"miR-");table(p1)
## p1
## FALSE TRUE
## 1221 137662
p2 = str_starts(lncmiRcode$microrna,"let-");table(p2)
## p2
## FALSE TRUE
## 137662 1221
是miR和let两种前缀,这里先将他们去掉,然后按照斜杠作为分隔符分割为单独的miRNA,再将前缀加回来。
#移除前缀,拆开后再补上
mis = lncmiRcode$microrna %>%
str_remove("miR-") %>%
str_remove("let-");head(mis,10)
## [1] "503"
## [2] "145"
## [3] "15abc/16/16abc/195/322/424/497/1907"
## [4] "103a/107/107ab"
## [5] "383"
## [6] "128/128ab"
## [7] "503"
## [8] "503"
## [9] "130ac/301ab/301b/301b-3p/454/721/4295/3666"
## [10] "7/7ab"
# 按/为分隔符分割,并返回每行有多少个miRNA
mis = str_split(mis,"/") ;head(mis,10)
## [[1]]
## [1] "503"
##
## [[2]]
## [1] "145"
##
## [[3]]
## [1] "15abc" "16" "16abc" "195" "322" "424" "497" "1907"
##
## [[4]]
## [1] "103a" "107" "107ab"
##
## [[5]]
## [1] "383"
##
## [[6]]
## [1] "128" "128ab"
##
## [[7]]
## [1] "503"
##
## [[8]]
## [1] "503"
##
## [[9]]
## [1] "130ac" "301ab" "301b" "301b-3p" "454" "721" "4295"
## [8] "3666"
##
## [[10]]
## [1] "7" "7ab"
ln = sapply(mis, length);head(ln)
## [1] 1 1 8 3 1 2
#把去掉的前缀再加回来
ps = list()
for(i in 1: length(ln)){
if(p1[[i]]){
ps[[i]] <- paste0("hsa-miR-",mis[[i]])
}else{
ps[[i]] <- paste0("hsa-let-",mis[[i]])
}
}
4.将整个表格重构
前面拆分字符串时,一起返回了列表每个元素的长度ln,那就是原表格每一行应该重复的次数。
lncmiRcode2 = data.frame(gene_id = rep(lncmiRcode$gene_id,times = ln),
gene_symbol = rep(lncmiRcode$gene_symbol,times = ln),
gene_class = rep(lncmiRcode$gene_class,times = ln),
microrna = unlist(ps))
head(lncmiRcode2);dim(lncmiRcode2)
## gene_id gene_symbol gene_class microrna
## 1 ENSG00000083622.8 AC000111.6.1 lncRNA, overlapping hsa-miR-503
## 2 ENSG00000083622.8 AC000111.6.1 lncRNA, overlapping hsa-miR-145
## 3 ENSG00000083622.8 AC000111.6.1 lncRNA, overlapping hsa-miR-15abc
## 4 ENSG00000083622.8 AC000111.6.1 lncRNA, overlapping hsa-miR-16
## 5 ENSG00000083622.8 AC000111.6.1 lncRNA, overlapping hsa-miR-16abc
## 6 ENSG00000083622.8 AC000111.6.1 lncRNA, overlapping hsa-miR-195
## [1] 359429 4
发现有一些关系对重复出现,按symbol-mirna两列去除重复
nrow(count(lncmiRcode2,gene_symbol,microrna))
## [1] 291978
lncmiRcode2 = distinct(lncmiRcode2,gene_symbol,microrna,.keep_all = T);nrow(lncmiRcode2)
## [1] 291978
head(lncmiRcode2)
## gene_id gene_symbol gene_class microrna
## 1 ENSG00000083622.8 AC000111.6.1 lncRNA, overlapping hsa-miR-503
## 2 ENSG00000083622.8 AC000111.6.1 lncRNA, overlapping hsa-miR-145
## 3 ENSG00000083622.8 AC000111.6.1 lncRNA, overlapping hsa-miR-15abc
## 4 ENSG00000083622.8 AC000111.6.1 lncRNA, overlapping hsa-miR-16
## 5 ENSG00000083622.8 AC000111.6.1 lncRNA, overlapping hsa-miR-16abc
## 6 ENSG00000083622.8 AC000111.6.1 lncRNA, overlapping hsa-miR-195
save(lncmiRcode2,file = "lncmiRcode2.Rdata")
5.可选:结合genecode注释
有的文献里会把GENEOCDE注释里没有的lncRNA去掉。其实mircode数据库比较陈旧了,用的是v11版本,参考基因组在变化,有点out。
这里使用的是genecodev22版本的gtf,是TCGA数据库对应的版本,严谨点是用v11才对。
load("anno.Rdata")
lncmiRcode3 = lncmiRcode2[lncmiRcode2$gene_symbol %in% lnc_anno$gene_name,];dim(lncmiRcode3)
## [1] 25837 4
#看看去掉了多少个lncRNA?数量惊人。
length(unique(lncmiRcode2$gene_symbol))
## [1] 10349
length(unique(lncmiRcode3$gene_symbol))
## [1] 730
一点探索
我想会不会是symbol的名称的变化导致的匹配数量太少?试一下按照gene_id。带着版本号的统计结果:
table(lncmiRcode2$gene_id %in% lnc_anno$gene_id)
##
## FALSE TRUE
## 126955 165023
table(lnc_anno$gene_id %in% lncmiRcode2$gene_id)
##
## FALSE TRUE
## 7942 6884
不带版本号的统计结果:
x1 = str_remove(lncmiRcode2$gene_id,"\\.\\d")
x2 = str_remove(lnc_anno$gene_id,"\\.\\d")
table(x1 %in% x2)
##
## FALSE TRUE
## 31910 260068
table(x2 %in% x1)
##
## FALSE TRUE
## 5552 9274
反正,不是很严谨。如果想要按照geneid匹配,操作上也可以实现。更期待的是这个数据库可以更新一下,这样我们就不用管什么版本了。
网友评论