########################################################
#-------------------------------------------------------
# Topic: scraper for NSFC (National Natural Science Foundation of China) grants
# Author:
# Date:Tue Apr 07 10:19:09 2020
# Mail:
#-------------------------------------------------------
########################################################
#' Scrape grant listings from fund.sciencenet.cn for one subject code
#'
#' For every year between `start_year` and `end_year` the function first
#' requests the listing page to read the number of result pages and the total
#' project count, then requests each result page and extracts one row per
#' listed project.
#'
#' Every HTTP request is counted; on every 20th request the function sleeps
#' for 300 seconds (the printed message says this is to avoid an IP ban).
#' A page that cannot be downloaded, that yields no projects, or whose fields
#' do not line up is reported and skipped, so rows collected from other pages
#' are still returned.
#'
#' @param id Subject code placed in the `subject=` query parameter
#'   (a single value, e.g. `"C1201"`). Also stored in the `Category` column.
#' @param start_year First year to query (single value).
#' @param end_year Last year to query (single value).
#' @return A data frame with the columns `Category`, `Title`, `Author`,
#'   `Institution`, `Type`, `ID`, `Year`, `Money` and `Abstract_url`;
#'   an empty data frame when nothing could be collected.
get_fund_info <- function(id, start_year = 2015, end_year = 2020) {
  stopifnot(
    length(id) == 1L,
    length(start_year) == 1L,
    length(end_year) == 1L
  )

  # Same packages the function has always installed on demand and attached;
  # requireNamespace() replaces require() so the check itself does not attach.
  for (pkg in c("stringr", "rvest", "curl")) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg)
    }
    library(pkg, character.only = TRUE)
  }

  # rvest >= 1.0.0 deprecates html_session() in favour of session(); fall
  # back to the old name so older installations keep working.
  open_session <- if ("session" %in% getNamespaceExports("rvest")) {
    rvest::session
  } else {
    rvest::html_session
  }

  # Download and parse one URL; returns NULL (with a warning) on failure so a
  # single bad request does not discard everything collected so far.
  fetch_page <- function(url) {
    tryCatch(
      rvest::read_html(open_session(url)),
      error = function(e) {
        warning("Failed to fetch ", url, ": ", conditionMessage(e),
                call. = FALSE)
        NULL
      }
    )
  }

  # Trimmed text of every node matched by `xpath`.
  node_text <- function(page, xpath) {
    rvest::html_text(rvest::html_nodes(page, xpath = xpath), trim = TRUE)
  }

  # Count one request and pause for five minutes on every 20th one.
  throttle <- function(count) {
    count <- count + 1
    if (count == 20) {
      print("为防止被封IP,自动进入休眠五分钟")
      Sys.sleep(300)
      count <- 0
    }
    count
  }

  base_url <- paste0(
    "http://fund.sciencenet.cn/search/smallSubject?subject=",
    utils::URLencode(as.character(id), reserved = TRUE)
  )

  collected <- list()
  request_count <- 0

  for (year in start_year:end_year) {
    request_count <- throttle(request_count)
    year_url <- paste0(base_url,
                       "&yearStart=", year,
                       "&yearEnd=", year,
                       "&submit=list")
    first_page <- fetch_page(year_url)
    if (is.null(first_page)) {
      next
    }

    # Highest numeric label among the pager buttons; when there is no pager
    # (results fit on a single page) fall back to one page.
    page_labels <- node_text(first_page, '//p[@id="page_button2"]/span')
    page_numbers <- suppressWarnings(as.numeric(page_labels))
    page_numbers <- page_numbers[!is.na(page_numbers)]
    total_page <- if (length(page_numbers) > 0) max(page_numbers) else 1

    # Total project count; isTRUE() guards against a missing or non-numeric
    # node, which would otherwise make the `if` condition invalid.
    project_count <- suppressWarnings(
      as.numeric(node_text(first_page, '//span[@class="l"]/b[1]'))
    )
    if (isTRUE(project_count[1] > 200)) {
      print(paste0(year, "年", "项目多于200,因此只展示前两百"))
    }

    for (i in seq_len(total_page)) {
      request_count <- throttle(request_count)
      url <- paste0(year_url, "&page=", i)
      print(paste0("正在访问:", url))

      page <- fetch_page(url)
      if (is.null(page)) {
        next
      }

      title_xpath <- '//p[@class="t"]/a'
      author_xpath <- '//span[@class="author"]'
      fields <- list(
        Title = node_text(page, title_xpath),
        Author = node_text(page, paste0(author_xpath, "/i")),
        Institution = node_text(
          page, paste0(author_xpath, "/following-sibling::span/i")
        ),
        Type = node_text(page, paste0(author_xpath, "/following-sibling::i")),
        ID = node_text(page, paste0(author_xpath, "/following-sibling::b")),
        Year = node_text(
          page, paste0(author_xpath, "/following-sibling::span/b")
        ),
        Money = node_text(page, '//p[@class="ico"]/following-sibling::p//b'),
        # The abstract link is the href of the same anchor that holds the title.
        Abstract_url = rvest::html_attr(
          rvest::html_nodes(page, xpath = title_xpath),
          name = "href"
        )
      )

      field_sizes <- lengths(fields)
      if (field_sizes[["Title"]] == 0) {
        # An empty page used to crash data.frame() before this message
        # could ever be printed.
        print(paste0("第", i, "页爬取失败,返回空内容"))
        next
      }
      if (length(unique(field_sizes)) != 1) {
        # Columns of different lengths cannot be aligned row by row, so the
        # page is skipped rather than producing mismatched records.
        warning(
          "Skipping page ", i, " of year ", year,
          ": extracted fields have differing lengths (",
          paste(names(field_sizes), field_sizes, sep = "=", collapse = ", "),
          ")",
          call. = FALSE
        )
        next
      }

      collected[[length(collected) + 1]] <-
        do.call(data.frame, c(list(Category = id), fields))
      print(paste0("第", i, "页爬取成功"))
    }
  }

  if (length(collected) == 0) {
    return(data.frame())
  }
  # Bind once at the end instead of growing a data frame inside the loop.
  do.call(rbind, collected)
}
# Example:
#Sys.sleep(600)
# Query subject code "C1201" with the start and end year both set to 2018.
my_infor <- get_fund_info(id = "C1201", start_year = 2018, end_year = 2018)
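# Reader comments (stray page heading left over from the web page this script was copied from; not R code)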