# Session setup: switch to the project directory so that the relative path
# used below to read the gene list resolves, then clear the workspace.
# NOTE(review): a hard-coded, user-specific setwd() and rm(list = ls()) make
# the script hard to reuse on another machine or inside a larger session;
# consider a project-relative path instead.
setwd("~/wangzh/批量爬取NCBI基因注释数据")
rm(list = ls())
library(RCurl)
library(stringr)
library(XML)
library(clusterProfiler)
# 读入基因列表:
library(readr)
# Read the gene list; the second column is renamed to "gene".
test_genes <- read_csv("test_genes.csv")
names(test_genes)[2] <- "gene"
# Keep only the text in front of ":precursor" for every entry
# (the first piece produced by the split).
test_genes <- vapply(
  strsplit(test_genes[["gene"]], ":precursor"),
  function(pieces) pieces[1],
  character(1)
)
# Prefix every identifier with "MIR" before the symbol lookup.
test_genes <- paste0("MIR", test_genes)
class(test_genes)
# Convert gene symbols to Entrez IDs.
genes <- bitr(
  test_genes,
  fromType = "SYMBOL",
  toType = "ENTREZID",
  OrgDb = "org.Hs.eg.db"
)
# Add one column holding the NCBI Gene page URL of every gene.
genes$NCBI_url <- paste0("https://www.ncbi.nlm.nih.gov/gene/", genes$ENTREZID)
head(genes)
# GeneRIF的xpath: //*[@id="padded_content"]/div[6]/div[2]/div[4]/div/div/div[2]/div
# Extract the text of every node matching an XPath expression.
#
# html_txt1: parsed document to query (the script passes the result of
#            htmlParse()).
# xpath_p:   XPath expression selecting the nodes of interest.
#
# Returns a character vector with the text of each matching node, with empty
# strings dropped and every run of "\n " (newline followed by a space)
# removed. Returns character(0) when nothing matches.
getNodesTxt <- function(html_txt1, xpath_p) {
  els1 <- getNodeSet(html_txt1, xpath_p)
  # Evaluate xmlValue() once per node. unlist(lapply()) always flattens to a
  # plain vector, and as.character() turns the NULL produced by an empty node
  # set into character(0), so the result type does not depend on how many
  # nodes matched (sapply() would hand back an empty list in that case).
  els1_txt <- as.character(unlist(lapply(els1, xmlValue)))
  # Drop nodes without any text.
  els1_txt <- els1_txt[nzchar(els1_txt)]
  # Strip the "\n " runs left over from the HTML layout.
  str_replace_all(els1_txt, "(\\n )+", "")
}
# Reduce the text extracted for a node to a single value that fits into one
# data-frame cell.
#
# NodeTxt: value to normalise; the script passes the character vector
#          returned by getNodesTxt().
#
# Returns the first element of NodeTxt when NodeTxt is a non-empty character
# vector; otherwise (zero length, or not character) returns NA.
dealNodeTxt <- function(NodeTxt) {
  # A plain if/else on the scalar condition replaces ifelse(): the selection
  # of the first element is now explicit, and the check no longer depends on
  # the reassignable shorthand `T`.
  if (is.character(NodeTxt) && length(NodeTxt) != 0) {
    NodeTxt[[1]]
  } else {
    NA
  }
}
# Scrape the summary list (id "summaryDl") of every gene's NCBI page and
# store each field as a column of `genes`.
# seq_len() keeps the loop from running when `genes` has zero rows;
# 1:nrow(genes) would iterate over c(1, 0) in that case.
for (i in seq_len(nrow(genes))) {
  # Download the page for this gene.
  doc <- getURL(genes[i, "NCBI_url"])
  cat("成功获得网页!\t")
  # Parse the downloaded HTML text so it can be queried with XPath.
  html_txt1 <- htmlParse(doc, asText = TRUE)
  # Full name: text of the <dd> that follows the <dt> containing "Symbol",
  # cut off in front of "provided".
  # NOTE(review): the column is called FullName but the XPath matches
  # "Symbol"; confirm which field is intended.
  genes[i, "FullName"] <- str_split(
    dealNodeTxt(getNodesTxt(html_txt1, '//*[@id="summaryDl"]/dd[preceding-sibling::dt[contains(text(),"Symbol") and position()=1 ] ]')),
    "provided"
  )[[1]][1]
  cat("写入基因\t")
  # HGNC ID (disabled):
  #genes[i,"HGNC_ID"] <- str_replace_all(getNodesTxt(html_txt1,'//*[@id="summaryDl"]/dd[preceding-sibling::dt[text()="Primary source" and position()=1 ] ]')," |HGNC|:","")
  #cat("写入HGNC_ID\t")
  # Also known as:
  genes[i, "Also_known_as"] <- dealNodeTxt(getNodesTxt(html_txt1, '//*[@id="summaryDl"]/dd[preceding-sibling::dt[text()="Also known as" and position()=1 ] ]'))
  cat("写入Also_known_as\t")
  # Gene type:
  genes[i, "GeneType"] <- dealNodeTxt(getNodesTxt(html_txt1, '//*[@id="summaryDl"]/dd[preceding-sibling::dt[text()="Gene type" and position()=1 ] ]'))
  cat("写入GeneType\t")
  # Expression:
  genes[i, "Expression"] <- dealNodeTxt(getNodesTxt(html_txt1, '//*[@id="summaryDl"]/dd[preceding-sibling::dt[text()="Expression" and position()=1 ] ]'))
  cat("写入Expression\n")
  # Summary:
  genes[i, "Summary"] <- dealNodeTxt(getNodesTxt(html_txt1, '//*[@id="summaryDl"]/dd[preceding-sibling::dt[text()="Summary" and position()=1 ] ]'))
  cat("写入Summary\n")
  # Progress message for the gene just processed.
  print(paste("完成第",i,"个了!"))
}
# Source: reposted from 生信杂谈, "R语言批量爬取NCBI基因注释数据" (batch scraping of NCBI gene annotation data with R).
# 网友评论 (reader comments; leftover web-page text, not part of the script)