# ---- Measure the running time of the script ----
# Remember when the run started and print that moment right away.
timestart <- Sys.time()
print(timestart)
#### Main program section: the code being timed starts here
# Query the current working directory, then switch into the project folder.
# The path is relative, so it resolves against whatever directory the script
# is started from; files opened later by relative name are created in it.
getwd()
setwd("./我的R/RCurl包学习/")
library(RCurl)
library(XML)
library(stringr)
# HTTP request headers imitating a desktop browser, stored as a named
# character vector (header name -> header value).
myheader <- c(
  "User-Agent"      = "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.6) ",
  "Accept"          = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language" = "en-us",
  "Connection"      = "keep-alive",
  "Accept-Charset"  = "GB2312,utf-8;q=0.7,*;q=0.7"
)
# Landing page to scan for links, and the site root that is prepended to the
# hrefs found on that page to form full URLs.
url <- "http://www.zbjuran.com/mei/"
url_ori <- "http://www.zbjuran.com"

# Fetch the landing page, sending the browser-like headers in `myheader`
# (they were defined above but previously never passed to any request).
wp <- getURL(
  url,
  .encoding = "gb2312",
  followlocation = TRUE,
  httpheader = myheader
)
doc <- htmlParse(wp, asText = TRUE, encoding = "UTF-8")

# Select only anchors that actually carry an href: an <a> without one makes
# xmlGetAttr() return NULL, which would turn the result into a list.
hrefs <- as.character(unlist(
  xpathSApply(doc, path = "//a[@href]", xmlGetAttr, "href")
))

# Keep the links whose href contains a run of four digits.
hrefs <- hrefs[str_detect(hrefs, "[0-9]{4}")]

# paste0() treats a zero-length argument as "", so without this guard an
# empty match set would produce the bare site root as a bogus page URL.
sex_url <- if (length(hrefs) > 0) paste0(url_ori, hrefs) else character(0)

# De-duplicate and sort (same result as the former as.factor()/levels() pair).
sex_url <- sort(unique(sex_url))
# Build the candidate follow-up pages of every link: "<name>.html" becomes
# "<name>_2.html" ... "<name>_<max_page>.html". Whether a candidate really
# exists is checked later, before it is downloaded.
max_page <- 30L
page_suffix <- paste0("_", 2:max_page, ".html")

# The pattern is escaped and anchored: in a regular expression a bare
# ".html" also matches e.g. "xhtml" anywhere in the URL, not just the
# literal file extension at the end.
x <- unlist(lapply(page_suffix, function(suffix) {
  str_replace(sex_url, pattern = "\\.html$", replacement = suffix)
}))

# Candidate pages first, original pages last (same order as before).
sex_url <- c(x, sex_url)
# Visit every candidate page and save each image found inside its
# <div class="picbox"> into the current working directory.
for (i in seq_along(sex_url)) {
  page_url <- sex_url[i]

  # Skip URLs that do not resolve before trying to download them.
  if (!url.exists(page_url)) {
    next
  }

  # A failure on one page (network error, unparsable HTML, unwritable file)
  # is reported and skipped instead of aborting the whole run.
  tryCatch(
    {
      wp1 <- getURL(
        page_url,
        .encoding = "gb2312",
        followlocation = TRUE,
        httpheader = myheader
      )
      doc1 <- htmlParse(wp1, asText = TRUE, encoding = "UTF-8")

      # A page may hold zero, one or several images; flatten to a plain
      # character vector so each of them is handled on its own.
      img_src <- as.character(unlist(xpathSApply(
        doc1,
        path = "//div[@class='picbox']//img[@src]",
        xmlGetAttr,
        "src"
      )))

      for (src in img_src) {
        url_pic <- paste0(url_ori, src)
        temp <- getBinaryURL(url_pic, httpheader = myheader)
        # Name the file after the last path component of the image URL
        # rather than a fixed position in the split URL. Passing the file
        # name to writeBin() lets it open and close the file itself, so no
        # connection is left open if writing fails.
        writeBin(temp, basename(url_pic))
      }
    },
    error = function(e) {
      message("Failed to process ", page_url, ": ", conditionMessage(e))
    }
  )

  # Optional pause between pages:
  # Sys.sleep(time = 30)
}
# Record when the run finished and print that moment.
timeend <- Sys.time()
print(timeend)

# Elapsed time between the recorded start and end, printed as a difftime.
runningtime <- difftime(timeend, timestart)
print(runningtime)
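# 网友评论 ("user comments": leftover heading from the web page this script was copied from; not part of the program)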