R批量查看基因注释数据

作者: 哇珍 | 来源:发表于2021-07-07 19:59 被阅读0次

R批量查看基因注释数据
使用R语言爬取DailyMed药物信息
R语言批量爬取NCBI基因注释数据
Python爬虫获取geneID对应的NCBI注释
RNA-seq 详细教程：注释（15）
批量注释基因到基因座上(map gene to locus)
2020-11-18
python单细胞测序分析教程-0 | 数据结构及操作
批量转换基因名
【linux编程】Linux文本处理三剑客——grep

# Start from an empty workspace, then move into the project directory so that
# the relative path used when reading the input CSV resolves.
rm(list = ls())
setwd("~/wangzh/批量爬取NCBI基因注释数据")

library(RCurl)
library(stringr)
library(XML)
library(clusterProfiler)

# Read the gene list and turn it into a character vector of gene symbols.
library(readr)
gene_table <- read_csv("test_genes.csv")
# The identifiers live in the second column; give it a fixed name.
names(gene_table)[2] <- "gene"
# Keep only the text in front of ":precursor". vapply() guarantees a character
# vector even when the table has no rows (sapply() would return a list).
gene_ids <- vapply(
  strsplit(gene_table$gene, ":precursor"),
  function(parts) parts[1],
  character(1)
)
# Drop missing/empty identifiers: paste0() would otherwise turn them into the
# bogus symbols "MIRNA" and "MIR".
gene_ids <- gene_ids[!is.na(gene_ids) & nzchar(gene_ids)]
if (length(gene_ids) == 0) {
  stop("No usable gene identifiers found in test_genes.csv", call. = FALSE)
}
# Prefix every identifier with "MIR" to build the symbol that is looked up.
test_genes <- paste0("MIR", gene_ids)
class(test_genes)

# Convert the gene symbols to Entrez IDs.
genes <- bitr(
  test_genes,
  fromType = "SYMBOL",
  toType = "ENTREZID",
  OrgDb = "org.Hs.eg.db"
)
# Add one NCBI Gene page URL per row, built from the Entrez ID.
genes[["NCBI_url"]] <- paste0(
  "https://www.ncbi.nlm.nih.gov/gene/",
  genes[["ENTREZID"]]
)
head(genes)

#  GeneRIF的xpath：  //*[@id="padded_content"]/div[6]/div[2]/div[4]/div/div/div[2]/div

# Extract the text of every node matched by an XPath expression.
#
# html_txt1: parsed document passed on to getNodeSet().
# xpath_p:   XPath expression selecting the nodes of interest.
#
# Returns a character vector with one entry per matched node whose text is not
# empty, with every run of "newline + space" removed. The result is
# character(0) when nothing matches.
getNodesTxt <- function(html_txt1, xpath_p) {
  nodes <- getNodeSet(html_txt1, xpath_p)
  # Evaluate xmlValue() once per node. unlist() + as.character() always yields
  # a character vector, including for an empty node set, where sapply() would
  # have returned list().
  node_txt <- as.character(unlist(lapply(nodes, xmlValue), use.names = FALSE))
  # Drop nodes that carry no text.
  node_txt <- node_txt[nzchar(node_txt)]
  # Remove the "\n " sequences left over from the page layout.
  str_replace_all(node_txt, "(\\n )+", "")
}

# Reduce a scraped node-text vector to a single value.
#
# NodeTxt: usually the character vector produced by getNodesTxt().
#
# Returns the first element of NodeTxt when it is a non-empty character
# vector, and NA otherwise (empty vector, NULL, or a non-character value).
dealNodeTxt <- function(NodeTxt) {
  # Plain if/else on a scalar condition: no dependence on the reassignable
  # `T`, and the "first element only" behaviour is explicit.
  if (is.character(NodeTxt) && length(NodeTxt) != 0) {
    NodeTxt[[1]]
  } else {
    NA
  }
}

# Scrape the summary panel of each gene's NCBI page and store the extracted
# fields as new columns of `genes`.

# Build the XPath for the <dd> whose nearest preceding <dt> has exactly the
# given label inside the "summaryDl" list.
summary_dd_xpath <- function(label) {
  sprintf(
    '//*[@id="summaryDl"]/dd[preceding-sibling::dt[text()="%s" and position()=1 ] ]',
    label
  )
}

# Read one labelled summary field from a parsed page; NA when it is absent.
read_summary_field <- function(html, label) {
  dealNodeTxt(getNodesTxt(html, summary_dd_xpath(label)))
}

# seq_len() performs no iterations when `genes` has no rows (1:nrow() would
# iterate over c(1, 0)).
for (i in seq_len(nrow(genes))) {
  # Download and parse the page. A failure for one gene is reported and that
  # gene is skipped, instead of aborting the whole run.
  html_txt1 <- tryCatch(
    htmlParse(getURL(genes[i, "NCBI_url"]), asText = TRUE),
    error = function(e) {
      warning(
        "Could not fetch ", genes[i, "NCBI_url"], ": ", conditionMessage(e),
        call. = FALSE, immediate. = TRUE
      )
      NULL
    }
  )
  if (is.null(html_txt1)) {
    next
  }
  cat("成功获得网页！\t")

  # Full name: text in front of "provided" in the matched entry.
  # NOTE(review): the XPath selects the <dd> after the <dt> containing
  # "Symbol", so this column appears to hold the symbol, not the full name -
  # confirm the intended field.
  genes[i, "FullName"] <- str_split(
    dealNodeTxt(getNodesTxt(html_txt1, '//*[@id="summaryDl"]/dd[preceding-sibling::dt[contains(text(),"Symbol") and position()=1 ] ]')),
    "provided"
  )[[1]][1]
  cat("写入基因\t")

  # HGNC ID (disabled in the original script):
  #genes[i,"HGNC_ID"] <- str_replace_all(getNodesTxt(html_txt1,'//*[@id="summaryDl"]/dd[preceding-sibling::dt[text()="Primary source" and position()=1 ] ]')," |HGNC|:","")
  #cat("写入HGNC_ID\t")

  # Also known as:
  genes[i, "Also_known_as"] <- read_summary_field(html_txt1, "Also known as")
  cat("写入Also_known_as\t")

  # Gene type:
  genes[i, "GeneType"] <- read_summary_field(html_txt1, "Gene type")
  cat("写入GeneType\t")

  # Expression:
  genes[i, "Expression"] <- read_summary_field(html_txt1, "Expression")
  cat("写入Expression\n")

  # Summary:
  genes[i, "Summary"] <- read_summary_field(html_txt1, "Summary")
  cat("写入Summary\n")

  print(paste("完成第", i, "个了！"))
}

原文转自：生信杂谈的R语言批量爬取NCBI基因注释数据