biomaRt 是R语言访问 BioMart software suite 数据库(例如 Ensembl, Uniprot, HapMap)的API接口
# Load biomaRt (BioMart client) and dplyr (provides %>% and the filter verbs).
library(biomaRt)
library(dplyr)

# Connect to the "ensembl" mart on the dec2021 Ensembl archive host.
ensembl <- useEnsembl(
  biomart = "ensembl",
  host = "https://dec2021.archive.ensembl.org/"
)

# Search terms used below to look up each species' dataset name.
species.1 <- "mouse"
species.2 <- "tguttata"
# Fuzzy-match each species term (common name or Latin-derived dataset prefix)
# against every column of the dataset listing to find the dataset index name.
# The variables are passed unquoted so their values are searched, not the
# literal text "species.1"; the match ignores case because descriptions are
# capitalised (e.g. "Mouse genes") while the search term is lower-case.
listDatasets(ensembl) %>%
  filter_all(any_vars(grepl(species.1, ., ignore.case = TRUE)))
listDatasets(ensembl) %>%
  filter_all(any_vars(grepl(species.2, ., ignore.case = TRUE)))
#以小鼠为例,通过模糊匹配"Mouse"可知小鼠GRCm39基因组对应的数据集索引名为:"mmusculus_gene_ensembl"
> listDatasets(ensembl) %>% filter_all( any_vars(grepl(species.1, ., ignore.case = TRUE)) )
                 dataset                  description  version
1  mmurinus_gene_ensembl Mouse Lemur genes (Mmur_3.0) Mmur_3.0
2 mmusculus_gene_ensembl         Mouse genes (GRCm39)   GRCm39
# Open the gene dataset of each species being queried, both on the same
# dec2021 Ensembl archive host.
mouse.geneset <- useMart(
  biomart = "ensembl",
  dataset = "mmusculus_gene_ensembl",
  host = "https://dec2021.archive.ensembl.org/"
)
tguttata.geneset <- useMart(
  biomart = "ensembl",
  dataset = "tguttata_gene_ensembl",
  host = "https://dec2021.archive.ensembl.org/"
)
##在数据集内查询gene symbol 的属性名,还是通过模糊匹配的方式
> listAttributes(mouse.geneset) %>% filter_all( any_vars(grepl("symbol", ., ignore.case = TRUE)) )
               name                description         page
1       hgnc_symbol                HGNC symbol feature_page
2        mgi_symbol                 MGI symbol feature_page
3 uniprot_gn_symbol UniProtKB Gene Name symbol feature_page
# 小鼠的gene symbol一般用mgi_symbol(根据自己查询的基因集的命名方式来选择),因为小鼠基因有自己的一套命名系统;
# 而一些没有独自命名系统的物种一般用的是hgnc_symbol(人类基因的命名系统)来标识基因。
## Cross-species ortholog lookup: for each gene in the tguttata gene set,
## find the corresponding mouse gene.
tguttata.query_sets <- c("COPG1", "VLDLR", "TBC1D22A")

# getLDS links the two datasets: query genes are matched by hgnc_symbol in
# the tguttata mart, and mgi_symbol is returned from the linked mouse mart.
# The result is stored in tguttata2mouse so it can be inspected afterwards.
tguttata2mouse <- getLDS(
  values = tguttata.query_sets,
  attributes = c("hgnc_symbol"),
  filters = "hgnc_symbol",
  mart = tguttata.geneset,
  attributesL = c("mgi_symbol"),
  martL = mouse.geneset,
  uniqueRows = TRUE
)
> tguttata2mouse
HGNC.symbol MGI.symbol
1 COPG1 Copg1
2 TBC1D22A Tbc1d22a
3 VLDLR Vldlr
网友评论