An exploratory analysis of the bioinformatics videos in my bilibili favorites folder

Author: 一只烟酒僧 | Published 2020-03-24 15:48
    ######################################################## 
    #-------------------------------------------------------
    # Topic: explore, analyze and summarize my own bilibili favorites folder
    # Author:
    # Date:Mon Mar 23 21:18:57 2020
    # Mail:
    #-------------------------------------------------------
    ########################################################
    
    #-------------------------------------------------------
    # 
    # Chapter 1: scrape the tags of every bioinformatics video in the favorites folder, count them, find the twenty most frequent tags, and pick out the keywords of interest
    # 
    #-------------------------------------------------------
    #-------------------------------------------------------
    #Function 1: collect the favorites-folder listing (including URLs)
    #-------------------------------------------------------
    
    options(encoding = "native.enc") # note: setting this to "utf-8" here produces garbled text! the default encoding in options() is native.enc
    library(RSelenium)
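    # A minimal sketch, not part of the original script: remoteDriver() below assumes a
    # Selenium/chromedriver server is already listening on localhost:4444. If none is
    # running yet, one way to start one from R is via the wdman package (an assumption
    # about the local setup; a Docker selenium image works just as well):
    selServ <- wdman::selenium(port = 4444L)   # launches a local Selenium server on port 4444
    # remember to call selServ$stop() once the whole script has finished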
    remDr<-remoteDriver(browser="chrome",remoteServerAddr="localhost",port=4444L)
    remDr$open()
    url<-"https://space.bilibili.com/65714723/favlist?fid=291229523&ftype=create" # URL of the favorites folder
    remDr$navigate(url)
    # helper: switch the active browser window by POSTing the target window handle to the WebDriver API
    myswitch <- function (remDr, windowId) 
    {
      qpath <- sprintf("%s/session/%s/window", remDr$serverURL, 
                       remDr$sessionInfo[["id"]])
      remDr$queryRD(qpath, "POST", qdata = list(handle = windowId))
    }
    myswitch(remDr,remDr$getWindowHandles()[[2]])
    library(rvest)
    title<-'//a[@class="title"]'                        # video title
    url<-'//a[contains(@class,"normal")]'               # href of each video card
    length<-'//span[@class="length"]'                   # video length
    bofang<-'//p[@class="view"]'                        # play count
    shouchang<-'//p[@class="favorite"]'                 # favorite count
    author<-'//p[@class="author"]'                      # uploader
    tougao<-'//p[@class="pubdate"]'                     # upload date
    woshoucangyu<-'//div[contains(@class,"pubdate")]'   # date I added the video to the folder
    infolist<-list(title='//a[@class="title"]',
                   length='//span[@class="length"]',
                   bofang='//p[@class="view"]',
                   shouchang='//p[@class="favorite"]',
                   author='//p[@class="author"]',
                   tougao='//p[@class="pubdate"]',
                   woshoucangyu='//div[contains(@class,"pubdate")]')
    page<-read_html(remDr$getPageSource()[[1]])
    info<-sapply(infolist,function(x){a=html_nodes(page,xpath = x)%>%html_text(trim = T);return(a)})
    info<-data.frame(info,stringsAsFactors = F)
    colnames(info)<-names(infolist)
    info$url<-html_nodes(page,xpath = url)%>%html_attr(name = "href")
    info$url<-paste("http:",info$url,sep = "")
    for (i in 2:13) {  # the favorites folder has 13 pages in total
     click<-remDr$findElement(using = "xpath",value = '//span/input[@type="text"]') 
     click$clearElement()
     click$sendKeysToElement(list(as.character(i),key="enter"))
     page<-read_html(remDr$getPageSource()[[1]])
     info_sub<-sapply(infolist,function(x){a=html_nodes(page,xpath = x)%>%html_text(trim = T);return(a)})
     info_sub<-data.frame(info_sub,stringsAsFactors = F)
     colnames(info_sub)<-names(infolist)
     info_sub$url<-html_nodes(page,xpath = url)%>%html_attr(name = "href")
     info_sub$url<-paste("http:",info_sub$url,sep = "")
     info<-rbind(info,info_sub)
    }
    write.csv(info,"info.csv")
    #-------------------------------------------------------
    #Function2: collect the tags of each favorited video
    #-------------------------------------------------------
    info$tag<-NA
    page_inner_info<-c()
    tag_url_all<-c()
    for (i in 1:dim(info)[1]) {
      if(info$url[i]!="http:javascript:;"){
      remDr$navigate(info$url[i])
      tag_dir<-'//ul[contains(@class,"tag-area")]/li/a'
      page_inner<-read_html(remDr$getPageSource()[[1]])
      page_inner_info_sub<-html_nodes(page_inner,xpath = tag_dir)%>%html_text()
      info$tag[i]<-paste(page_inner_info_sub,collapse = ";")
      page_inner_info<-c(page_inner_info,page_inner_info_sub)
      # get the URL behind each tag; note: do not let the mouse linger over the page while this runs!
      tag_url_dir<-'//ul[contains(@class,"tag-area")]/li/div//a[contains(@href,"bilibili")]'
      tag_url_ele1<-remDr$findElements(using = "xpath",tag_dir)
      Sys.sleep(1)
      for (j in 1:length(tag_url_ele1)) {
        remDr$mouseMoveToLocation(webElement = tag_url_ele1[[j]])
        Sys.sleep(2)
        tag_url<-read_html(remDr$getPageSource()[[1]])%>%html_node(xpath = tag_url_dir)%>%html_attr(name = "href")
        tag_url<-paste("http:",tag_url,sep = "")
        tag_url_all<-c(tag_url_all,tag_url)
      }
      }else{next()}
    }
    page_inner_info_data<-data.frame(page_inner_info=page_inner_info,tag_url=tag_url_all)
    
    # write.csv(info,"info.csv")
    # write.csv(page_inner_info_data,"page_inner_info_data.csv")
    #-------------------------------------------------------
    #Function3: count how often each tag appears
    #-------------------------------------------------------
    library(jiebaR)
    library(wordcloud2)
    page_inner_info_freq<-freq(page_inner_info)
    page_inner_info_freq<-page_inner_info_freq[order(page_inner_info_freq$freq,decreasing = T),]
    wordcloud2(page_inner_info_freq)
    page_inner_info_freq[1:20,]
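    # Optional sketch (not in the original): a quick bar chart of the top-20 tags as a
    # complement to the word cloud; "char" and "freq" are the column names returned by
    # jiebaR::freq().
    top20_tags<-page_inner_info_freq[1:20,]
    barplot(top20_tags$freq,names.arg = top20_tags$char,las = 2,cex.names = 0.8,
            main = "Top 20 tags in the favorites folder")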
    #-------------------------------------------------------
    # 
    # Chapter 2: batch-scrape the video listings under each selected tag
    # 
    #-------------------------------------------------------
    library(stringr)
    library(xlsx)    # assumed: the xlsx package, whose write.xlsx() signature matches the call at the end of the loop below
    key_word<-c("R语言","生物信息学","生物信息","数据挖掘","机器学习","linux","数据分析","人工智能","PYTHON")
    page_inner_info_data_sub<-page_inner_info_data[page_inner_info_data$page_inner_info%in%key_word & page_inner_info_data$tag_url!="http:NA",]
    page_inner_info_data_sub<-page_inner_info_data_sub[!duplicated(page_inner_info_data_sub$page_inner_info),]
    top_url<-as.character(page_inner_info_data_sub$tag_url)
    # Information collected for each video: 1. video title
    #                2. uploader (up主)
    #                3. video description
    #                4. official tag
    #                5. play count
    #                6. danmaku count
    #                7. URL
    #                8. video category (the tag name)
    ##################################### information only visible after opening the video page
    #                (items 9-10; see the sketch after this loop)
    #                9. publish date
    #                10. video length
    for (i in 1:length(top_url)) {
    
      remDr$navigate(top_url[i])
      shipin<-remDr$findElement(using = "xpath",value = '//*[@id="app"]/div[3]/div[2]/div[2]')
      remDr$mouseMoveToLocation(webElement = shipin)
      remDr$click()
      page_topic<-read_html(remDr$getPageSource()[[1]])
      total_page<-html_node(page_topic,xpath = '//*[@id="app"]/div[3]/div[3]/div/div/div[2]/div/div/div[1]/span[1]')%>%html_text()%>%str_extract(pattern = "[0-9]{1,}")
      info_topic_sub_all<-data.frame()
      for (j in 1:as.numeric(total_page)) {
       input_num<-remDr$findElement(using = "xpath",'//input[@class="page-input"]')
       input_num$clearElement()
       input_num$sendKeysToElement(list(as.character(j),key="enter"))
       Sys.sleep(3)
       page_topic<-read_html(remDr$getPageSource()[[1]])
       #获取1-7列信息
       info_most_topic<-'//li[@class="content-item"]'  ## contains items 2-6
       info_other_topic<-'//li/div/a[2]' # contains items 1 and 7
       info_most_topic_info<-sapply(c("data-upmaster","data-danmu","data-play","data-text","data-tagname"),function(x){a=html_nodes(page_topic,xpath = info_most_topic)%>%html_attr(name=x);return(a)})
       info_other_topic_info<-sapply(c("title","href"),function(x){a=html_nodes(page_topic,xpath = info_other_topic)%>%html_attr(name=x);return(a)})
       info_topic_sub<-cbind(as.data.frame(info_most_topic_info),as.data.frame(info_other_topic_info))
       info_topic_sub$tag<-page_inner_info_data_sub$page_inner_info[i] 
       info_topic_sub<-data.frame(Title=info_topic_sub$title,
                                  up=info_topic_sub$`data-upmaster`,
                                  Introduce=info_topic_sub$`data-text`,
                                  Official_tag=info_topic_sub$`data-tagname`,
                                  Play_num=info_topic_sub$`data-play`,
                                  Danmu_num=info_topic_sub$`data-danmu`,
                                  Url=info_topic_sub$href,
                                  Topic_name=info_topic_sub$tag
                                  )
       info_topic_sub$Url<-paste("http:",info_topic_sub$Url,sep = "")
       info_topic_sub_all<-rbind(info_topic_sub_all,info_topic_sub)
       }
      
       write.xlsx(info_topic_sub_all,"info_topic_all.xlsx",sheetName = page_inner_info_data_sub$page_inner_info[i],append = T,row.names = F)
    
      
    
    }
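    #-------------------------------------------------------
    # Sketch (not in the original script): items 9 and 10 listed above (publish date and
    # video length) are only visible on each video's own page. A hypothetical helper for
    # collecting them from the URLs already gathered; the XPath selectors below are
    # assumptions and may need adjusting to bilibili's current page layout.
    #-------------------------------------------------------
    get_pub_and_length<-function(video_url){
      remDr$navigate(video_url)
      Sys.sleep(2)
      pg<-read_html(remDr$getPageSource()[[1]])
      pubdate<-html_node(pg,xpath = '//span[contains(@class,"pubdate")]')%>%html_text(trim = T)
      duration<-html_node(pg,xpath = '//span[contains(@class,"duration")]')%>%html_text(trim = T)
      c(pubdate = pubdate,duration = duration)
    }
    # e.g. extra_info<-t(sapply(info_topic_sub_all$Url,get_pub_and_length))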
    
    #-------------------------------------------------------
    # 
    # Chapter 3: explore the sub-tags of all the scraped videos and find the tags that appear most often within each category (tags within tags)
    # 
    #-------------------------------------------------------
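    # Sketch (not in the original script): one way to implement Chapter 3, assuming the
    # data frame info_topic_sub_all from Chapter 2 (or the sheets written to
    # info_topic_all.xlsx, read back and row-bound) is available, and assuming the
    # Official_tag column may hold several tags separated by ";" or ",".
    sub_tags<-unlist(strsplit(as.character(info_topic_sub_all$Official_tag),"[;,，、]"))
    sub_tag_freq<-as.data.frame(table(trimws(sub_tags)),stringsAsFactors = F)
    colnames(sub_tag_freq)<-c("tag","freq")
    sub_tag_freq<-sub_tag_freq[order(sub_tag_freq$freq,decreasing = T),]
    sub_tag_freq[1:20,]   # the most frequent sub-tags across all scraped videos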
    
    
