美文网首页 > ggplot集锦
R批量查看基因注释数据

R批量查看基因注释数据

作者: 哇珍 | 来源:发表于2021-07-07 19:59 被阅读0次
    # Work from the project directory so that the relative path "test_genes.csv"
    # used further down resolves.
    # NOTE(review): a hard-coded setwd() ties the script to one machine; consider
    # passing the data directory in instead.
    # The original `rm(list=ls())` was removed: it wipes every object in the
    # caller's workspace as a side effect, and nothing below depends on starting
    # from an empty environment.
    setwd("~/wangzh/批量爬取NCBI基因注释数据")
    
    library(RCurl)
    library(stringr)
    library(XML)
    library(clusterProfiler)
    
    # Read the gene list. The second column of the CSV holds the identifiers.
    library(readr)
    test_genes <- read_csv("test_genes.csv")
    # Fail early with a clear message if the file does not have a second column.
    stopifnot(ncol(test_genes) >= 2)
    names(test_genes)[2] <- "gene"  # rename the second column of the data frame
    # Keep only the part of each identifier that precedes ":precursor".
    # `[` with index 1 extracts the first piece of every split; vapply() (instead
    # of sapply()) guarantees a character vector whatever the input length.
    test_genes <- vapply(
      strsplit(test_genes$gene, ":precursor"),
      `[`,
      character(1),
      1
    )
    # Prefix every identifier with "MIR" to build the symbol that is looked up.
    test_genes <- paste0("MIR", test_genes)
    class(test_genes)
    
    # Convert the gene symbols to Entrez IDs.
    genes <- bitr(
      test_genes,
      fromType = "SYMBOL",
      toType = "ENTREZID",
      OrgDb = "org.Hs.eg.db"
    )
    # Add a column with the NCBI Gene page URL of every mapped gene.
    ncbi_gene_base <- "https://www.ncbi.nlm.nih.gov/gene/"
    genes$NCBI_url <- paste0(ncbi_gene_base, genes$ENTREZID)
    head(genes)
    
    #  GeneRIF的xpath:  //*[@id="padded_content"]/div[6]/div[2]/div[4]/div/div/div[2]/div
    
    # Extract the text of every node matched by an XPath expression.
    #
    # html_txt1: parsed document handed to getNodeSet().
    # xpath_p:   XPath expression selecting the nodes of interest.
    #
    # Returns a character vector with one entry per node whose text is not the
    # empty string, after removing every run of "newline followed by one space".
    # Returns character(0) when no node matches.
    getNodesTxt <- function(html_txt1, xpath_p) {
      nodes <- getNodeSet(html_txt1, xpath_p)
      # Evaluate xmlValue() once per node and flatten to a plain character
      # vector. Unlike sapply(), this never yields an empty list when the node
      # set is empty, so callers always receive a character vector.
      node_txt <- as.character(unlist(lapply(nodes, xmlValue), use.names = FALSE))
      # Drop nodes whose text is the empty string.
      node_txt <- node_txt[nzchar(node_txt)]
      # Strip each "newline + space" sequence (pattern kept exactly as before).
      str_replace_all(node_txt, "(\\n )+", "")
    }
    
    # Normalise extracted node text to a single value.
    #
    # NodeTxt: the value produced by a node-text extraction; may be a character
    #          vector of any length, or something else (e.g. an empty list).
    #
    # Returns the first element of NodeTxt when it is a non-empty character
    # vector, otherwise NA. The result is always a length-1 character value, so
    # it can be assigned straight into a single data-frame cell.
    dealNodeTxt <- function(NodeTxt) {
      # A plain if/else replaces the scalar ifelse(): ifelse() returned only the
      # first element implicitly (its result takes the shape of the length-1
      # test); here that choice is explicit.
      if (is.character(NodeTxt) && length(NodeTxt) != 0) {
        NodeTxt[[1]]
      } else {
        NA_character_
      }
    }
    
    # Fetch the NCBI Gene page of every row in `genes` and store selected
    # entries of its summary list (the element with id "summaryDl") as columns.

    # Create the output columns up front so they exist, as character columns,
    # even when a page cannot be fetched or `genes` has no rows.
    for (field in c("FullName", "Also_known_as", "GeneType", "Expression", "Summary")) {
      if (is.null(genes[[field]])) {
        genes[[field]] <- rep(NA_character_, nrow(genes))
      }
    }

    # XPath template: the <dd> whose immediately preceding <dt> has exactly the
    # given text. "%s" is filled in with the entry label via sprintf().
    dd_xpath <- '//*[@id="summaryDl"]/dd[preceding-sibling::dt[text()="%s" and position()=1 ] ]'

    # seq_len() iterates zero times for an empty `genes`; 1:nrow(genes) would
    # run over c(1, 0) instead.
    for (i in seq_len(nrow(genes))) {
      # Download the page. A failed request only skips this gene (its cells stay
      # NA) instead of aborting the whole batch.
      doc <- tryCatch(
        getURL(genes[i, "NCBI_url"]),
        error = function(e) {
          warning(
            "Failed to fetch ", genes[i, "NCBI_url"], ": ", conditionMessage(e),
            call. = FALSE
          )
          NULL
        }
      )
      if (is.null(doc)) {
        next
      }
      cat("成功获得网页!\t")
      # Parse the downloaded text as HTML.
      html_txt1 <- htmlParse(doc, asText = TRUE)

      # FullName: text of the entry whose <dt> contains "Symbol", truncated at
      # the first occurrence of "provided".
      # NOTE(review): the column is called FullName but the XPath selects the
      # "Symbol" entry - confirm this is the intended field.
      symbol_txt <- dealNodeTxt(getNodesTxt(html_txt1, '//*[@id="summaryDl"]/dd[preceding-sibling::dt[contains(text(),"Symbol") and position()=1 ] ]'))
      genes[i, "FullName"] <- str_split(symbol_txt, "provided")[[1]][1]
      cat("写入基因\t")
      # HGNC ID (disabled in the original script, kept for reference):
      #genes[i,"HGNC_ID"] <- str_replace_all(getNodesTxt(html_txt1,'//*[@id="summaryDl"]/dd[preceding-sibling::dt[text()="Primary source" and position()=1 ] ]')," |HGNC|:","")
      #cat("写入HGNC_ID\t")
      # Also known as:
      genes[i, "Also_known_as"] <- dealNodeTxt(getNodesTxt(html_txt1, sprintf(dd_xpath, "Also known as")))
      cat("写入Also_known_as\t")
      # Gene type:
      genes[i, "GeneType"] <- dealNodeTxt(getNodesTxt(html_txt1, sprintf(dd_xpath, "Gene type")))
      cat("写入GeneType\t")
      # Expression:
      genes[i, "Expression"] <- dealNodeTxt(getNodesTxt(html_txt1, sprintf(dd_xpath, "Expression")))
      cat("写入Expression\n")
      # Summary:
      genes[i, "Summary"] <- dealNodeTxt(getNodesTxt(html_txt1, sprintf(dd_xpath, "Summary")))
      cat("写入Summary\n")

      print(paste("完成第",i,"个了!"))
    }
    

    原文转自:生信杂谈的R语言批量爬取NCBI基因注释数据

    相关文章

      网友评论

        本文标题:R批量查看基因注释数据

        本文链接:https://www.haomeiwen.com/subject/yzogultx.html