Scraping NSFC Grant Information from ScienceNet

By 一只烟酒僧 | Published 2020-04-07 17:07
######################################################## 
#-------------------------------------------------------
# Topic: NSFC grant crawler
# Author:
# Date:Tue Apr 07 10:19:09 2020
# Mail:
#-------------------------------------------------------
########################################################

 

get_fund_info<-function(id,start_year=2015,end_year=2020){
  # scrape NSFC grant records for subject code `id` from fund.sciencenet.cn,
  # one year at a time between start_year and end_year
  if(!require(stringr))install.packages("stringr")
  if(!require(rvest))install.packages("rvest")
  if(!require(curl))install.packages("curl")
  library(stringr)
  library(rvest)
  library(curl)
  information<-data.frame()
  index=0
  for (m in start_year:end_year) {
    index=index+1
    if(index==20){
      print("Sleeping for five minutes to avoid an IP ban")
      Sys.sleep(300)
      index=0
    }
    # get the maximum number of result pages for this year
    url1<-paste("http://fund.sciencenet.cn/search/smallSubject?subject=",id,
                "&yearStart=",m,
                "&yearEnd=",m,
                "&submit=list",sep = "")
    FUND<-html_session(url1)
    FUND<-read_html(FUND)
    total_page<-'//p[@id="page_button2"]/span'
    total_page<-html_nodes(FUND,xpath = total_page)%>%html_text(trim = T)
    total_page<-as.numeric(total_page)
    total_page<-max(total_page[!is.na(total_page)])
    if(is.infinite(total_page)){total_page=1}# some years return less than one full page of results
    # get the total number of projects for this year
    pro_num<-'//span[@class="l"]/b[1]'
    pro_num<-html_nodes(FUND,xpath = pro_num)%>%html_text(trim = T)
    pro_num<-as.numeric(pro_num)
    if(pro_num>200){print(paste("Year ",m,": more than 200 projects, only the first 200 are listed",sep=""))}
    
    
    # scrape the fields we need
    for (i in 1:total_page) {
      index=index+1
      if(index==20){
        print("Sleeping for five minutes to avoid an IP ban")
        Sys.sleep(300)
        index=0
      }
      url<-paste("http://fund.sciencenet.cn/search/smallSubject?subject=",id,
                 "&yearStart=",m,
                 "&yearEnd=",m,
                 "&submit=list&page=",i,sep = "")
      print(paste("正在访问:",url,sep=""))
      FUND<-html_session(url)
      FUND<-read_html(FUND)
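      # XPath selectors for title, PI, institution, grant type, grant number, year, amount, and the link to the abstract page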
      title<-'//p[@class="t"]/a'
      author<-'//span[@class="author"]/i'
      danwei<-'//span[@class="author"]/following-sibling::span/i'
      type<-'//span[@class="author"]/following-sibling::i'
      ID<-'//span[@class="author"]/following-sibling::b'
      year<-'//span[@class="author"]/following-sibling::span/b'
      money<-'//p[@class="ico"]/following-sibling::p//b'
      abstract='//p[@class="t"]/a'
      information1<-data.frame(Category=id,
                               Title=html_nodes(FUND,xpath =title )%>%html_text(trim = T),
                               Author=html_nodes(FUND,xpath =author )%>%html_text(trim = T),
                               Institution=html_nodes(FUND,xpath =danwei )%>%html_text(trim = T),
                               Type=html_nodes(FUND,xpath =type )%>%html_text(trim = T),
                               ID=html_nodes(FUND,xpath =ID )%>%html_text(trim = T),
                               Year=html_nodes(FUND,xpath =year )%>%html_text(trim = T),
                               Money=html_nodes(FUND,xpath =money )%>%html_text(trim = T),
                               Abstract_url=html_nodes(FUND,xpath = abstract)%>%html_attr(name="href"))
      if(is.na(information1$Title[1])){print(paste("Page ",i," failed: empty content returned",sep=""))}else{print(paste("Page ",i," scraped successfully",sep=""))}
      information<-rbind(information,information1) 
    }
    
  }
  return(information)
}
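
Note: html_session() was renamed session() in rvest 1.0. This search page needs no login or cookies, so on either rvest version the two-step session call inside the function can be replaced by a single read_html() on the URL (a minimal substitution, not part of the original script):

# instead of FUND<-html_session(url1); FUND<-read_html(FUND)
FUND<-read_html(url1)
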
# Example:
#Sys.sleep(600)
my_infor<-get_fund_info("C1201",2018,2018)
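
The function returns one text row per grant. A minimal post-processing sketch, assuming the Money column holds strings such as "58万元" (the exact format on the site may differ) and that you want per-institution totals plus a local copy; the Money_wan column and the file name are made up for illustration:

# extract the numeric award amount (in 万元, i.e. 10k CNY)
my_infor$Money_wan<-as.numeric(str_extract(as.character(my_infor$Money),"[0-9.]+"))
# total funding per institution, largest first
sort(tapply(my_infor$Money_wan,my_infor$Institution,sum,na.rm=TRUE),decreasing=TRUE)
# keep a raw copy on disk
write.csv(my_infor,"fund_C1201_2018.csv",row.names=FALSE)
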
