爬取豆瓣id和IMDB_id
#输入电影名字、导演、演员信息,爬取豆瓣id和IMDB_id
#输入信息必须经过严格清洗,不允许出现空格,导演和演员字符长度严格小于等于6
#### 计算程序的运行时间
timestart<-Sys.time();
#打印开始时间
print(timestart)
####这块写你要运行的程序
#报头设置非常重要,爬虫一定要伪装,另外for循环一定要间隔休息
library(xlsx)
library(readxl)
library(plyr)
library(sqldf)
library(data.table)
library(RCurl)
library(XML)
library(stringr)
#伪装报头
myheader<-c(
"User-Agent"="Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.6) ",
"Accept"="text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language"="en-us",
"Connection"="keep-alive",
"Accept-Charset"="GB2312,utf-8;q=0.7,*;q=0.7"
)
########定义编辑距离函数#############
Fun <- function(x,y){
library(stringr)
m <- str_length(x)
n <- str_length(y)
x <- str_split(x,pattern = "")[[1]];
y <- str_split(y,pattern = "")[[1]];
M <- matrix(0,nrow = m+1,ncol = n+1);
rownames(M) <- c(" ",x)
colnames(M) <- c(" ",y)
for(i in 1:(m+1)) M[i,1] <- i-1;
for(j in 1:(n+1)) M[1,j] <- j-1;
for(i in 2:(m+1)){
for(j in 2:(n+1)){
if(x[i-1]==y[j-1]) cost=0 else cost=1;
M[i,j]=min(M[i-1,j]+1,M[i,j-1]+1,M[i-1,j-1]+cost)
}
}
#返回字符串的相似度
return(round(1-M[m+1,n+1]/(m+n),2));
}
#url <- "https://movie.douban.com/"
#text=c("碟中谍","狮子王","魔戒3","星际穿越","火星救援","碟中谍2","职业特工队2","谍影重重2","碟中谍5")
#text="哈利波特与魔法石"
#text="少年派的奇幻漂流"
#text="哈利波特与死亡圣器(下)"
#text="手机"
#text <- t(c("加勒比海盗1:黑珍珠号的诅咒","冯小刚"," 张国立葛优范"))
#text <- t(c("哈利波特与死亡圣器(下)" , "大飞", "廖智苗皓钧"))
#输入参数
#text <- y[1:100,c(1,2,3)]
#i=2
#抽样测试
#text <- text[sample(2901,200,replace = F),]
##################测试###########################
#text <- as.data.frame(t(z[1,]),stringsAsFactors = F)
###################匹配分类##############
#A <- "完全匹配"
#B <- "多个匹配但前五结果唯一"
#C <- "返回一个结果标题不匹配但详情页匹配"
#D <- "前五结果多个但是匹配上了"
#E <- "完全不匹配"
#F <- "前五结果多个但是没有匹配上"
url <- paste("https://movie.douban.com/subject_search?search_text=",text[,1],"&cat=1002",sep = "")
url_douban <- NULL;
url_douban_id <- NULL;
imdb_id <- NULL;
class <- NULL;
#输入数据英文小括号改写为正则表达式\\(和\\)
text[,1]<- str_replace(str_replace(text[,1],pattern = "\\(",replacement = "\\\\("),pattern = "\\)",replacement = "\\\\)");
#i=1
for(i in 1:length(url)){
wp<-getURL(url[i],.encoding="utf-8",followlocation=T,httpheader=myheader)
doc <- htmlParse(wp,asText=T,encoding="UTF-8")#解析
#text[i]为电影名字
title<- xpathSApply(doc,"//div[@class='pl2']//a",xmlValue)#搜索页所有的结果标题
title <- str_replace_all(title,pattern = "·",replacement = "")
pipei <- str_detect(title,paste(" ",text[i,1]," {0,2}(\\(.{2,3}\\))?(:.{0,8})?","[\n | /]",sep = ""))
#如果匹配列表个数等于1
if(length(which(pipei))==1){
subscript <- which(pipei)[1]#匹配上电影的下标
url_douban <- xpathSApply(doc,"//div[@class='pl2']/a",xmlGetAttr,"href")[subscript]#进入搜索结果的链接
url_douban_id[i] <- str_split(url_douban,pattern = "/")[[1]][5]#豆瓣的id号
wp1<-getURL(url_douban,.encoding="utf-8",followlocation=T,httpheader=myheader)
doc1 <- htmlParse(wp1,asText=T,encoding="UTF-8")#解析
if(length(xpathSApply(doc1,path="//div[@id='info']//a[@target='_blank' and @rel='nofollow'][last()]",xmlValue))==0){ imdb_id[i] <- "000"}
else{imdb_id[i] <- xpathSApply(doc1,path="//div[@id='info']//a[@target='_blank' and @rel='nofollow'][last()]",xmlValue)}
class[i] <- "A"
}
else if(length(which(pipei))>1){
if(length(which(pipei[1:5]))==1){
subscript <- which(pipei)[1]#匹配上电影的下标
url_douban <- xpathSApply(doc,"//div[@class='pl2']/a",xmlGetAttr,"href")[subscript]#进入搜索结果的链接
url_douban_id[i] <- str_split(url_douban,pattern = "/")[[1]][5]#豆瓣的id号
wp1<-getURL(url_douban,.encoding="utf-8",followlocation=T,httpheader=myheader)
doc1 <- htmlParse(wp1,asText=T,encoding="UTF-8")#解析
if(length(xpathSApply(doc1,path="//div[@id='info']//a[@target='_blank' and @rel='nofollow'][last()]",xmlValue))==0) imdb_id[i] <- "000"
else {imdb_id[i] < xpathSApply(doc1,path="//div[@id='info']//a[@target='_blank' and @rel='nofollow'][last()]",xmlValue)}
class[i] <- "B"
}
else{
#########需要进行二次匹配###########
url_pipei <-xpathSApply(doc,"//div[@class='pl2']//a",xmlGetAttr,"href")[which(pipei)];
imdb_id_temp <- NULL;
xishu <- NULL;
for(n in 1:length(url_pipei)){
wp1<-getURL(url_pipei[n],.encoding="utf-8",followlocation=T,httpheader=myheader)
doc1 <- htmlParse(wp1,asText=T,encoding="UTF-8")
#导演
daoyan <- xpathSApply(doc1,path = "//div[@id='info']//span",xmlValue)[1]
# if(is.null(daoyan)){daoyan <- ""}
daoyan <- str_replace_all(daoyan,pattern = "导演:","")
daoyan <- str_replace_all(daoyan,pattern = " ","")
daoyan <- str_replace_all(daoyan,pattern = "/","")
if(length(daoyan)==0){daoyan <- " "}
if(str_length(daoyan)>6)
{daoyan <- substring(daoyan,1,6)}
#主演
zhuyan <- xpathSApply(doc1,path = "//div[@id='info']//span[@class='actor']",xmlValue)
if(length(zhuyan)==0){zhuyan <- "abcdef"}
zhuyan <- str_replace_all(zhuyan,pattern = "主演:","")
zhuyan <- str_replace_all(zhuyan,pattern = " ","")
zhuyan <- str_replace_all(zhuyan,pattern = "/","")
if(str_length(zhuyan)>6) {zhuyan <- substring(zhuyan,1,6)}
p <- xpathSApply(doc1,path="//div[@id='info']//a[@target='_blank' and @rel='nofollow'][last()]",xmlValue)
if(length(p)==0){imdb_id_temp[n] <-"000"}
else{imdb_id_temp[n] <-xpathSApply(doc1,path="//div[@id='info']//a[@target='_blank' and @rel='nofollow'][last()]",xmlValue)}
#加权系数计算
xishu[n] <- 0.6*Fun(text[i,2],daoyan)+0.4*Fun(text[i,3],zhuyan)###需要将输入参数改为三个变量的数据框
}
subscript <- which.max(xishu)
if(xishu[subscript]>0.5)
{
url_douban_id[i] <- str_split(url_pipei[subscript],pattern = "/")[[1]][5]
imdb_id[i] <- imdb_id_temp[subscript]
class[i] <- "D"
}
else{
url_douban_id[i] <- 0;
imdb_id[i] <- 0;
class[i] <- "F"
}
}
}
else {
if(length(pipei)==1)
{
url_douban <- xpathSApply(doc,"//div[@class='pl2']/a",xmlGetAttr,"href")[1]
wp1<-getURL(url_douban,.encoding="utf-8",followlocation=T,httpheader=myheader)
doc1 <- htmlParse(wp1,asText=T,encoding="UTF-8")#解析
if(length(xpathSApply(doc1,path = "//div[@id='info']",xmlValue))!=0){
text_another_name <- str_extract(xpathSApply(doc1,path = "//div[@id='info']",xmlValue),pattern = "又名:.*IMDb链接")
if(is.na(text_another_name)){
text_another_name <- "aaaaaa"
}
}
else{
text_another_name <- "aaaaaa"
}
if(str_detect(text_another_name,text[i,1]))
{
url_douban_id[i] <- str_split(url_douban,pattern = "/")[[1]][5]#豆瓣的id号
if(length(xpathSApply(doc1,path="//div[@id='info']//a[@target='_blank' and @rel='nofollow'][last()]",xmlValue))!=0){
imdb_id[i] <- xpathSApply(doc1,path="//div[@id='info']//a[@target='_blank' and @rel='nofollow'][last()]",xmlValue)}
else{imdb_id[i] <- "000"}
class[i] <- "C"
}
else {
url_douban_id[i] <- NA;
imdb_id[i] <- NA;
class[i] <- "E";
}
}
else{
url_douban_id[i] <- NA;
imdb_id[i] <- NA;
class[i] <- "E";
}
}
#每一次循环休息2秒左右
Sys.sleep(2+runif(1,0,1))
}
#整理成数据框
x <- data.frame(text[,1],url_douban_id,imdb_id,class)
#如果匹配列表返回值前五个出现相同的匹配结果,则返回id=0;考虑将结果范围缩小到
#如果列表返回值是1,但是不匹配名称,则获得链接,进入详情信息页面
#对搜索列表的电影名称进行精简修改,注意英文名字需要加上分隔符,比如哈利波特、珀西杰克逊等
#标题第二个字段好像是没有进行匹配的,需要进行修改
#计算程序结束时间
timeend<-Sys.time()
#打印结束时间
print(timeend)
runningtime<-timeend-timestart
#输出时间消耗
print(runningtime)
网友评论