美文网首页科研信息学
mircode数据库遇见R语言

mircode数据库遇见R语言

作者: 小洁忘了怎么分身 | 来源:发表于2020-06-25 21:59 被阅读0次

本文使用的文件和脚本,我已经打包好。请在生信星球公众号后台回复“code658”获取。 mircode也是常用的查询lncRNA和miRNA关系的数据库。我下载了它的互作关系数据,放在R语言整理一下方便批量查询。

1.读取mircode的lncRNA数据

rm(list = ls())
options(stringsAsFactors = F)
mircode = read.delim("mircode_highconsfamilies.txt.gz",stringsAsFactors = F);dim(mircode)
## [1] 1329071      12

2.将lncRNA的数据挑选出来

colnames(mircode)
##  [1] "gene_id"            "gene_symbol"        "gene_class"        
##  [4] "microrna"           "seed_pos"           "seed_type"         
##  [7] "repeat."            "total_cons_."       "primates_cons_."   
## [10] "mammals_cons_."     "vertebrates_cons_." "tr_region"
table(mircode$gene_class)
## 
##              coding  lncRNA, intergenic lncRNA, overlapping               other 
##             1018974               74646               64237               40266 
##          pseudogene 
##              130948

可以看到,gene_class里有两类属于lncRNA,区分了intergenic和overlapping,将他们对应的行单独挑选出来。

library(dplyr)
lncmiRcode = mircode[mircode$gene_class %in%c("lncRNA, intergenic","lncRNA, overlapping"),1:4];dim(lncmiRcode)
## [1] 138883      4

3.拆分mir-lncRNA关系对

看一下它的mircorna列,有一些合并在一起了:

head(lncmiRcode$microrna)
## [1] "miR-503"                                
## [2] "miR-145"                                
## [3] "miR-15abc/16/16abc/195/322/424/497/1907"
## [4] "miR-103a/107/107ab"                     
## [5] "miR-383"                                
## [6] "miR-128/128ab"

稍微了解一下mirna的命名规则,它并不都是以miR开头的:

library(stringr)
p1 = str_starts(lncmiRcode$microrna,"miR-");table(p1)
## p1
##  FALSE   TRUE 
##   1221 137662
p2 = str_starts(lncmiRcode$microrna,"let-");table(p2)
## p2
##  FALSE   TRUE 
## 137662   1221

是miR和let两种前缀,这里先将他们去掉,然后按照斜杠作为分隔符分割为单独的miRNA,再将前缀加回来。

#移除前缀,拆开后再补上
mis = lncmiRcode$microrna %>% 
  str_remove("miR-") %>% 
  str_remove("let-");head(mis,10)
##  [1] "503"                                       
##  [2] "145"                                       
##  [3] "15abc/16/16abc/195/322/424/497/1907"       
##  [4] "103a/107/107ab"                            
##  [5] "383"                                       
##  [6] "128/128ab"                                 
##  [7] "503"                                       
##  [8] "503"                                       
##  [9] "130ac/301ab/301b/301b-3p/454/721/4295/3666"
## [10] "7/7ab"
# 按/为分隔符分割,并返回每行有多少个miRNA
mis = str_split(mis,"/") ;head(mis,10)
## [[1]]
## [1] "503"
## 
## [[2]]
## [1] "145"
## 
## [[3]]
## [1] "15abc" "16"    "16abc" "195"   "322"   "424"   "497"   "1907" 
## 
## [[4]]
## [1] "103a"  "107"   "107ab"
## 
## [[5]]
## [1] "383"
## 
## [[6]]
## [1] "128"   "128ab"
## 
## [[7]]
## [1] "503"
## 
## [[8]]
## [1] "503"
## 
## [[9]]
## [1] "130ac"   "301ab"   "301b"    "301b-3p" "454"     "721"     "4295"   
## [8] "3666"   
## 
## [[10]]
## [1] "7"   "7ab"
ln = sapply(mis, length);head(ln)
## [1] 1 1 8 3 1 2
#把去掉的前缀再加回来
ps = list()
for(i in 1: length(ln)){
  if(p1[[i]]){
    ps[[i]] <- paste0("hsa-miR-",mis[[i]])
  }else{
    ps[[i]] <- paste0("hsa-let-",mis[[i]])
  }
}

4.将整个表格重构

前面拆分字符串时,一起返回了列表每个元素的长度ln,那就是原表格每一行应该重复的次数。

lncmiRcode2 = data.frame(gene_id = rep(lncmiRcode$gene_id,times = ln),
                         gene_symbol = rep(lncmiRcode$gene_symbol,times = ln),
                         gene_class = rep(lncmiRcode$gene_class,times = ln),
                         microrna = unlist(ps))
head(lncmiRcode2);dim(lncmiRcode2)
##             gene_id  gene_symbol          gene_class      microrna
## 1 ENSG00000083622.8 AC000111.6.1 lncRNA, overlapping   hsa-miR-503
## 2 ENSG00000083622.8 AC000111.6.1 lncRNA, overlapping   hsa-miR-145
## 3 ENSG00000083622.8 AC000111.6.1 lncRNA, overlapping hsa-miR-15abc
## 4 ENSG00000083622.8 AC000111.6.1 lncRNA, overlapping    hsa-miR-16
## 5 ENSG00000083622.8 AC000111.6.1 lncRNA, overlapping hsa-miR-16abc
## 6 ENSG00000083622.8 AC000111.6.1 lncRNA, overlapping   hsa-miR-195
## [1] 359429      4

发现有一些关系对重复出现,按symbol-mirna两列去除重复

nrow(count(lncmiRcode2,gene_symbol,microrna))
## [1] 291978
lncmiRcode2 = distinct(lncmiRcode2,gene_symbol,microrna,.keep_all = T);nrow(lncmiRcode2)
## [1] 291978
head(lncmiRcode2)
##             gene_id  gene_symbol          gene_class      microrna
## 1 ENSG00000083622.8 AC000111.6.1 lncRNA, overlapping   hsa-miR-503
## 2 ENSG00000083622.8 AC000111.6.1 lncRNA, overlapping   hsa-miR-145
## 3 ENSG00000083622.8 AC000111.6.1 lncRNA, overlapping hsa-miR-15abc
## 4 ENSG00000083622.8 AC000111.6.1 lncRNA, overlapping    hsa-miR-16
## 5 ENSG00000083622.8 AC000111.6.1 lncRNA, overlapping hsa-miR-16abc
## 6 ENSG00000083622.8 AC000111.6.1 lncRNA, overlapping   hsa-miR-195
save(lncmiRcode2,file = "lncmiRcode2.Rdata")

5.可选:结合genecode注释

有的文献里会把GENEOCDE注释里没有的lncRNA去掉。其实mircode数据库比较陈旧了,用的是v11版本,参考基因组在变化,有点out。

这里使用的是genecodev22版本的gtf,是TCGA数据库对应的版本,严谨点是用v11才对。

load("anno.Rdata")
lncmiRcode3 = lncmiRcode2[lncmiRcode2$gene_symbol %in% lnc_anno$gene_name,];dim(lncmiRcode3)
## [1] 25837     4
#看看去掉了多少个lncRNA?数量惊人。
length(unique(lncmiRcode2$gene_symbol))
## [1] 10349
length(unique(lncmiRcode3$gene_symbol))
## [1] 730

一点探索

我想会不会是symbol的名称的变化导致的匹配数量太少?试一下按照gene_id。带着版本号的统计结果:

table(lncmiRcode2$gene_id %in% lnc_anno$gene_id)
## 
##  FALSE   TRUE 
## 126955 165023
table(lnc_anno$gene_id %in% lncmiRcode2$gene_id)
## 
## FALSE  TRUE 
##  7942  6884

不带版本号的统计结果:

x1  = str_remove(lncmiRcode2$gene_id,"\\.\\d")
x2  = str_remove(lnc_anno$gene_id,"\\.\\d")
table(x1 %in% x2)
## 
##  FALSE   TRUE 
##  31910 260068
table(x2 %in% x1)
## 
## FALSE  TRUE 
##  5552  9274

反正,不是很严谨。如果想要按照geneid匹配,操作上也可以实现。更期待的是这个数据库可以更新一下,这样我们就不用管什么版本了。

相关文章

网友评论

    本文标题:mircode数据库遇见R语言

    本文链接:https://www.haomeiwen.com/subject/llsefktx.html