美文网首页
RCurl包爬取豆瓣电影id和IMDB电影号id

RCurl包爬取豆瓣电影id和IMDB电影号id

作者: 485b1aca799e | 来源:发表于2017-06-18 14:14 被阅读0次

    爬取豆瓣id和IMDB_id

    
    #输入电影名字、导演、演员信息,爬取豆瓣id和IMDB_id
    #输入信息必须经过严格清洗,不允许出现空格,导演和演员字符长度严格小于等于6
    
    
    #### 计算程序的运行时间
    timestart<-Sys.time();
    #打印开始时间
    print(timestart)
    ####这块写你要运行的程序
    
    
    #报头设置非常重要,爬虫一定要伪装,另外for循环一定要间隔休息
    library(xlsx)
    library(readxl)
    library(plyr)
    library(sqldf)
    library(data.table)
    library(RCurl)
    library(XML)
    library(stringr)
    #伪装报头
    myheader<-c(
      "User-Agent"="Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.6) ",
      "Accept"="text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
      "Accept-Language"="en-us",
      "Connection"="keep-alive",
      "Accept-Charset"="GB2312,utf-8;q=0.7,*;q=0.7"
    )
    
    
    ########定义编辑距离函数#############
    Fun <- function(x,y){
      library(stringr)
      
      m <- str_length(x)
      n <- str_length(y)
      
      x <- str_split(x,pattern = "")[[1]];
      y <- str_split(y,pattern = "")[[1]];
      
      M <- matrix(0,nrow = m+1,ncol = n+1);
      rownames(M) <- c(" ",x)
      colnames(M) <- c(" ",y)
      
      for(i in 1:(m+1)) M[i,1] <- i-1; 
      for(j in 1:(n+1)) M[1,j] <- j-1; 
      
      for(i in 2:(m+1)){
        for(j in 2:(n+1)){
          if(x[i-1]==y[j-1]) cost=0 else cost=1;
          M[i,j]=min(M[i-1,j]+1,M[i,j-1]+1,M[i-1,j-1]+cost)
        }
      }
      #返回字符串的相似度
      return(round(1-M[m+1,n+1]/(m+n),2));
      
    }
    
    
    
    
    
    #url <- "https://movie.douban.com/"
    
    #text=c("碟中谍","狮子王","魔戒3","星际穿越","火星救援","碟中谍2","职业特工队2","谍影重重2","碟中谍5")
    #text="哈利波特与魔法石"
    #text="少年派的奇幻漂流"
    #text="哈利波特与死亡圣器(下)"
    #text="手机"
    #text <- t(c("加勒比海盗1:黑珍珠号的诅咒","冯小刚"," 张国立葛优范"))
    #text <- t(c("哈利波特与死亡圣器(下)" ,        "大飞",   "廖智苗皓钧"))
    #输入参数
    #text <- y[1:100,c(1,2,3)]
    #i=2
    #抽样测试
    #text <- text[sample(2901,200,replace = F),]
    
    ##################测试###########################
    #text <- as.data.frame(t(z[1,]),stringsAsFactors = F)
    
    ###################匹配分类##############
    #A <- "完全匹配"
    #B <- "多个匹配但前五结果唯一"
    #C <- "返回一个结果标题不匹配但详情页匹配"
    #D <- "前五结果多个但是匹配上了"
    #E <- "完全不匹配"
    #F <- "前五结果多个但是没有匹配上"
    
    
    url <- paste("https://movie.douban.com/subject_search?search_text=",text[,1],"&cat=1002",sep = "")
    url_douban <- NULL;
    url_douban_id <- NULL;
    imdb_id <- NULL;
    class <- NULL;
    
    
    
    #输入数据英文小括号改写为正则表达式\\(和\\)
    text[,1]<- str_replace(str_replace(text[,1],pattern = "\\(",replacement = "\\\\("),pattern = "\\)",replacement = "\\\\)");
    
    #i=1
    for(i in 1:length(url)){
      wp<-getURL(url[i],.encoding="utf-8",followlocation=T,httpheader=myheader)
      doc <- htmlParse(wp,asText=T,encoding="UTF-8")#解析
      #text[i]为电影名字
      
      title<- xpathSApply(doc,"//div[@class='pl2']//a",xmlValue)#搜索页所有的结果标题  
      title <- str_replace_all(title,pattern = "·",replacement = "")
      
      
      pipei <- str_detect(title,paste(" ",text[i,1]," {0,2}(\\(.{2,3}\\))?(:.{0,8})?","[\n | /]",sep = ""))
      
      #如果匹配列表个数等于1
      if(length(which(pipei))==1){
        subscript <- which(pipei)[1]#匹配上电影的下标
        
        url_douban <- xpathSApply(doc,"//div[@class='pl2']/a",xmlGetAttr,"href")[subscript]#进入搜索结果的链接
        url_douban_id[i] <- str_split(url_douban,pattern = "/")[[1]][5]#豆瓣的id号
        wp1<-getURL(url_douban,.encoding="utf-8",followlocation=T,httpheader=myheader)
        doc1 <- htmlParse(wp1,asText=T,encoding="UTF-8")#解析
        if(length(xpathSApply(doc1,path="//div[@id='info']//a[@target='_blank' and @rel='nofollow'][last()]",xmlValue))==0){ imdb_id[i] <- "000"}
        else{imdb_id[i] <- xpathSApply(doc1,path="//div[@id='info']//a[@target='_blank' and @rel='nofollow'][last()]",xmlValue)}
        class[i] <- "A"
      }
      else if(length(which(pipei))>1){
        if(length(which(pipei[1:5]))==1){
          subscript <- which(pipei)[1]#匹配上电影的下标
          
          url_douban <- xpathSApply(doc,"//div[@class='pl2']/a",xmlGetAttr,"href")[subscript]#进入搜索结果的链接
          url_douban_id[i] <- str_split(url_douban,pattern = "/")[[1]][5]#豆瓣的id号
          wp1<-getURL(url_douban,.encoding="utf-8",followlocation=T,httpheader=myheader)
          doc1 <- htmlParse(wp1,asText=T,encoding="UTF-8")#解析
          
            if(length(xpathSApply(doc1,path="//div[@id='info']//a[@target='_blank' and @rel='nofollow'][last()]",xmlValue))==0) imdb_id[i] <- "000"
          else {imdb_id[i] <     xpathSApply(doc1,path="//div[@id='info']//a[@target='_blank' and @rel='nofollow'][last()]",xmlValue)}
          class[i] <- "B"
        } 
        else{
          #########需要进行二次匹配###########
          url_pipei <-xpathSApply(doc,"//div[@class='pl2']//a",xmlGetAttr,"href")[which(pipei)]; 
          
          imdb_id_temp <- NULL;
          xishu <- NULL;
          for(n in 1:length(url_pipei)){
            
            wp1<-getURL(url_pipei[n],.encoding="utf-8",followlocation=T,httpheader=myheader)
            doc1 <- htmlParse(wp1,asText=T,encoding="UTF-8")
            #导演
            daoyan <- xpathSApply(doc1,path = "//div[@id='info']//span",xmlValue)[1]
            # if(is.null(daoyan)){daoyan <- ""}
            daoyan <- str_replace_all(daoyan,pattern = "导演:","")
            daoyan <- str_replace_all(daoyan,pattern = " ","")
            daoyan <- str_replace_all(daoyan,pattern = "/","")
            if(length(daoyan)==0){daoyan <- " "}
            if(str_length(daoyan)>6)
            {daoyan <- substring(daoyan,1,6)}
            
            #主演
            zhuyan <- xpathSApply(doc1,path = "//div[@id='info']//span[@class='actor']",xmlValue)
            if(length(zhuyan)==0){zhuyan <- "abcdef"}
            zhuyan <- str_replace_all(zhuyan,pattern = "主演:","")
            zhuyan <- str_replace_all(zhuyan,pattern = " ","")
            zhuyan <- str_replace_all(zhuyan,pattern = "/","")
            
            if(str_length(zhuyan)>6) {zhuyan <- substring(zhuyan,1,6)}
            
            p <- xpathSApply(doc1,path="//div[@id='info']//a[@target='_blank' and @rel='nofollow'][last()]",xmlValue)
            
            if(length(p)==0){imdb_id_temp[n] <-"000"}
            else{imdb_id_temp[n] <-xpathSApply(doc1,path="//div[@id='info']//a[@target='_blank' and @rel='nofollow'][last()]",xmlValue)}
            
            #加权系数计算
            xishu[n] <- 0.6*Fun(text[i,2],daoyan)+0.4*Fun(text[i,3],zhuyan)###需要将输入参数改为三个变量的数据框
            
          }
          subscript <- which.max(xishu)
          if(xishu[subscript]>0.5)
          {
            url_douban_id[i] <- str_split(url_pipei[subscript],pattern = "/")[[1]][5]
            imdb_id[i] <- imdb_id_temp[subscript]
            class[i] <- "D"
          }
          else{
            url_douban_id[i] <- 0;
            imdb_id[i] <- 0;
            class[i] <- "F"
          }
        }
      }
      else {
        if(length(pipei)==1)
        {
          url_douban <- xpathSApply(doc,"//div[@class='pl2']/a",xmlGetAttr,"href")[1]
          wp1<-getURL(url_douban,.encoding="utf-8",followlocation=T,httpheader=myheader)
          doc1 <- htmlParse(wp1,asText=T,encoding="UTF-8")#解析
          if(length(xpathSApply(doc1,path = "//div[@id='info']",xmlValue))!=0){
          text_another_name <- str_extract(xpathSApply(doc1,path = "//div[@id='info']",xmlValue),pattern = "又名:.*IMDb链接")
          if(is.na(text_another_name)){
            text_another_name <- "aaaaaa"
          }
          }
          else{
            text_another_name <- "aaaaaa"
          }
          if(str_detect(text_another_name,text[i,1]))
          {
            url_douban_id[i] <- str_split(url_douban,pattern = "/")[[1]][5]#豆瓣的id号
           if(length(xpathSApply(doc1,path="//div[@id='info']//a[@target='_blank' and @rel='nofollow'][last()]",xmlValue))!=0){
             imdb_id[i] <- xpathSApply(doc1,path="//div[@id='info']//a[@target='_blank' and @rel='nofollow'][last()]",xmlValue)}
            else{imdb_id[i] <- "000"}
            class[i] <- "C"
          }
          else {
            url_douban_id[i] <- NA;
            imdb_id[i] <- NA;
            class[i] <- "E";
          }
        }
        else{
          url_douban_id[i] <- NA;
          imdb_id[i] <- NA;
          class[i] <- "E";
        }
      }  
      #每一次循环休息2秒左右
      Sys.sleep(2+runif(1,0,1))
    }
    
    #整理成数据框
    x <- data.frame(text[,1],url_douban_id,imdb_id,class)
    
    
    #如果匹配列表返回值前五个出现相同的匹配结果,则返回id=0;考虑将结果范围缩小到
    #如果列表返回值是1,但是不匹配名称,则获得链接,进入详情信息页面
    #对搜索列表的电影名称进行精简修改,注意英文名字需要加上分隔符,比如哈利波特、珀西杰克逊等
    #标题第二个字段好像是没有进行匹配的,需要进行修改
    
    #计算程序结束时间
    timeend<-Sys.time()
    #打印结束时间
    print(timeend)
    runningtime<-timeend-timestart
    #输出时间消耗 
    print(runningtime)
    
    

    相关文章

      网友评论

          本文标题:RCurl包爬取豆瓣电影id和IMDB电影号id

          本文链接:https://www.haomeiwen.com/subject/qyiiqxtx.html