美文网首页
从科学网爬取国自然基金信息

从科学网爬取国自然基金信息

作者: 一只烟酒僧 | 来源:发表于2020-04-07 17:07 被阅读0次
    ######################################################## 
    #-------------------------------------------------------
    # Topic:国自然基金爬虫
    # Author:
    # Date:Tue Apr 07 10:19:09 2020
    # Mail:
    #-------------------------------------------------------
    ########################################################
    
     
    
    get_fund_info <- function(id, start_year = 2015, end_year = 2020) {
      # Scrape NSFC (National Natural Science Foundation of China) grant
      # listings from fund.sciencenet.cn for one subject code, year by year.
      #
      # Args:
      #   id:         subject (discipline) code, e.g. "C1201".
      #   start_year: first application year to fetch (default 2015).
      #   end_year:   last application year to fetch (default 2020).
      #
      # Returns:
      #   A data.frame with columns Category, Title, Author, Institution,
      #   Type, ID, Year, Money, Abstract_url; zero rows if nothing found.
      #   NOTE(review): the site only lists the first 200 hits per year.

      # Install-on-demand, then attach. requireNamespace() avoids the
      # require()-for-loading anti-pattern (require returns FALSE silently).
      for (pkg in c("stringr", "rvest", "curl")) {
        if (!requireNamespace(pkg, quietly = TRUE)) install.packages(pkg)
      }
      library(stringr)
      library(rvest)
      library(curl)

      pages <- list()        # one data.frame per scraped page; bound once at end
      request_count <- 0L

      # Pause 5 minutes after every 20 HTTP requests to avoid an IP ban.
      # Mirrors the original counter that was duplicated in both loops.
      throttle <- function() {
        request_count <<- request_count + 1L
        if (request_count >= 20L) {
          print("为防止被封IP,自动进入休眠五分钟")
          Sys.sleep(300)
          request_count <<- 0L
        }
      }

      for (yr in start_year:end_year) {
        throttle()
        base_url <- paste0(
          "http://fund.sciencenet.cn/search/smallSubject?subject=", id,
          "&yearStart=", yr,
          "&yearEnd=", yr,
          "&submit=list"
        )
        # html_session() is deprecated in rvest >= 1.0; read_html() accepts
        # a URL directly and is equivalent here.
        first_page <- read_html(base_url)

        # Pager labels -> page count. The pager is absent (or has no numeric
        # labels) when results fit on one page, so fall back to 1 instead of
        # comparing max(<empty>) == -Inf as the original did.
        page_labels <- html_nodes(first_page, xpath = '//p[@id="page_button2"]/span') %>%
          html_text(trim = TRUE)
        page_nums <- suppressWarnings(as.numeric(page_labels))
        page_nums <- page_nums[!is.na(page_nums)]
        total_page <- if (length(page_nums) > 0) max(page_nums) else 1

        # Total hit count for this year; guard against a failed parse (NA
        # in a scalar `if` is an error).
        pro_num <- html_nodes(first_page, xpath = '//span[@class="l"]/b[1]') %>%
          html_text(trim = TRUE)
        pro_num <- suppressWarnings(as.numeric(pro_num[1]))
        if (!is.na(pro_num) && pro_num > 200) {
          print(paste(yr, "年", "项目多于200,因此只展示前两百", sep = ""))
        }

        for (pg in seq_len(total_page)) {
          throttle()
          url <- paste0(base_url, "&page=", pg)
          print(paste("正在访问:", url, sep = ""))
          doc <- read_html(url)

          # Small helper: extract trimmed text for one XPath.
          grab <- function(xp) {
            html_nodes(doc, xpath = xp) %>% html_text(trim = TRUE)
          }

          titles <- grab('//p[@class="t"]/a')
          if (length(titles) == 0) {
            # Zero nodes would make data.frame() error out ("differing
            # number of rows"); report and skip instead of crashing.
            print(paste("第", pg, "页爬取失败,返回空内容", sep = ""))
            next
          }

          page_data <- data.frame(
            Category     = id,
            Title        = titles,
            Author       = grab('//span[@class="author"]/i'),
            Institution  = grab('//span[@class="author"]/following-sibling::span/i'),
            Type         = grab('//span[@class="author"]/following-sibling::i'),
            ID           = grab('//span[@class="author"]/following-sibling::b'),
            Year         = grab('//span[@class="author"]/following-sibling::span/b'),
            Money        = grab('//p[@class="ico"]/following-sibling::p//b'),
            Abstract_url = html_nodes(doc, xpath = '//p[@class="t"]/a') %>%
              html_attr(name = "href"),
            stringsAsFactors = FALSE
          )
          print(paste("第", pg, "页爬取成功", sep = ""))
          pages[[length(pages) + 1]] <- page_data
        }
      }

      # Bind all pages at once instead of growing with rbind() per page.
      if (length(pages) == 0) {
        return(data.frame())
      }
      do.call(rbind, pages)
    }
    # Example usage: fetch subject C1201 for the single year 2018.
    # (Uncomment the sleep if running immediately after other scraping.)
    # Sys.sleep(600)
    my_infor <- get_fund_info(id = "C1201", start_year = 2018, end_year = 2018)
    

    相关文章

      网友评论

          本文标题:从科学网爬取国自然基金信息

          本文链接:https://www.haomeiwen.com/subject/eqinphtx.html