########################################################
#-------------------------------------------------------
# Topic:对我自己的bilibili收藏夹做探索和分析总结
# Author:
# Date:Mon Mar 23 21:18:57 2020
# Mail:
#-------------------------------------------------------
########################################################
#-------------------------------------------------------
#
# Chapter1:爬取收藏夹中所有生信视频的标签,并对标签做统计,找到出现次数最多的前二十名标签,并从中找到感兴趣的词条
#
#-------------------------------------------------------
#-------------------------------------------------------
#Function1:获得收藏夹信息(含url)
#-------------------------------------------------------
# Keep the connection encoding at R's default, "native.enc"; the original
# author notes that setting it to UTF-8 here garbles the text.
options(encoding = "native.enc")
library(RSelenium)

# Chrome session driven through the Selenium server on localhost:4444.
remDr <- remoteDriver(
  remoteServerAddr = "localhost",
  port = 4444L,
  browser = "chrome"
)
remDr$open()

# Address of the favourites folder to scrape.
url <- "https://space.bilibili.com/65714723/favlist?fid=291229523&ftype=create"
remDr$navigate(url)
# POST a window handle to the session's `/window` endpoint
# (presumably the WebDriver "switch to window" command -- confirm).
#
# remDr:    driver object exposing `serverURL`, `sessionInfo[["id"]]` and
#           `queryRD()`.
# windowId: window handle sent as the `handle` field of the request body.
# Returns whatever `remDr$queryRD()` returns.
myswitch <- function(remDr, windowId) {
  session_id <- remDr$sessionInfo[["id"]]
  endpoint <- sprintf("%s/session/%s/window", remDr$serverURL, session_id)
  payload <- list(handle = windowId)
  remDr$queryRD(endpoint, "POST", qdata = payload)
}
# Point the driver at the second window handle before scraping.
# NOTE(review): this errors when the session has only one window -- confirm
# that a second window is always open at this point.
myswitch(remDr, remDr$getWindowHandles()[[2]])
library(rvest)

# XPath of every text field scraped per favourite. The list names become the
# column names of `info`. (The former stand-alone copies of these XPaths were
# unused, and one of them shadowed base::length, so they are gone.)
infolist <- list(
  title = '//a[@class="title"]',
  length = '//span[@class="length"]',
  bofang = '//p[@class="view"]',
  shouchang = '//p[@class="favorite"]',
  author = '//p[@class="author"]',
  tougao = '//p[@class="pubdate"]',
  woshoucangyu = '//div[contains(@class,"pubdate")]'
)
# XPath of the anchors whose href attribute is the video link.
url <- '//a[contains(@class,"normal")]'

# Parse the page currently shown by `remDr` into a data frame with one row per
# favourite: one column per entry of `field_xpaths` plus a `url` column.
read_fav_page <- function(remDr, field_xpaths, href_xpath) {
  page <- read_html(remDr$getPageSource()[[1]])
  # lapply (not sapply) keeps one column per field even when the page holds a
  # single item or none at all.
  fields <- lapply(field_xpaths, function(x) {
    html_text(html_nodes(page, xpath = x), trim = TRUE)
  })
  out <- data.frame(fields, stringsAsFactors = FALSE)
  hrefs <- html_attr(html_nodes(page, xpath = href_xpath), name = "href")
  # sprintf() returns character(0) for an empty page, where paste() would
  # return a single "http:".
  out$url <- sprintf("http:%s", hrefs)
  out
}

n_pages <- 13L  # the favourites folder has thirteen pages in total
page_wait <- 3  # seconds to wait after a page jump, as in the Chapter 2 loop

fav_pages <- vector("list", n_pages)
fav_pages[[1]] <- read_fav_page(remDr, infolist, url)
for (i in seq_len(n_pages)[-1]) {
  # Type the page number into the pager's text box and press Enter.
  click <- remDr$findElement(using = "xpath", value = '//span/input[@type="text"]')
  click$clearElement()
  click$sendKeysToElement(list(as.character(i), key = "enter"))
  # Without a pause the page source may still be the previous page.
  Sys.sleep(page_wait)
  fav_pages[[i]] <- read_fav_page(remDr, infolist, url)
}
info <- do.call(rbind, fav_pages)
write.csv(info, "info.csv")
#-------------------------------------------------------
#Function2:获得标签信息
#-------------------------------------------------------
# Visit every favourite, record its tags in `info$tag` (joined with ";"), and
# collect each tag together with the link found after hovering over it.
info$tag <- NA
tag_dir <- '//ul[contains(@class,"tag-area")]/li/a'
tag_url_dir <- '//ul[contains(@class,"tag-area")]/li/div//a[contains(@href,"bilibili")]'

n_videos <- nrow(info)
tags_by_video <- vector("list", n_videos)
tag_urls_by_video <- vector("list", n_videos)

for (i in seq_len(n_videos)) {
  # Entries whose link is the "javascript:;" placeholder cannot be opened.
  if (info$url[i] == "http:javascript:;") {
    next
  }
  remDr$navigate(info$url[i])
  page_inner <- read_html(remDr$getPageSource()[[1]])
  page_inner_info_sub <- html_text(html_nodes(page_inner, xpath = tag_dir))
  info$tag[i] <- paste(page_inner_info_sub, collapse = ";")

  # Collect the link behind each tag. Do not leave the real mouse pointer
  # resting on the browser page while this runs.
  tag_url_ele1 <- remDr$findElements(using = "xpath", tag_dir)
  Sys.sleep(1)
  # One URL slot per tag name keeps names and URLs the same length even if
  # the live element count differs from the parsed one. "http:NA" is the
  # "no link found" marker that Chapter 2 filters on.
  tag_url_sub <- rep("http:NA", length(page_inner_info_sub))
  n_hover <- min(length(tag_url_ele1), length(page_inner_info_sub))
  # seq_len() makes a video without tags a no-op instead of an error.
  for (j in seq_len(n_hover)) {
    remDr$mouseMoveToLocation(webElement = tag_url_ele1[[j]])
    Sys.sleep(2)
    # NOTE(review): html_node() takes the first matching link on the page;
    # this presumably relies on only the hovered tag's popup being present.
    tag_url <- html_attr(
      html_node(read_html(remDr$getPageSource()[[1]]), xpath = tag_url_dir),
      name = "href"
    )
    tag_url_sub[j] <- paste0("http:", tag_url)
  }
  tags_by_video[[i]] <- page_inner_info_sub
  tag_urls_by_video[[i]] <- tag_url_sub
}

page_inner_info <- as.character(unlist(tags_by_video))
tag_url_all <- as.character(unlist(tag_urls_by_video))
page_inner_info_data <- data.frame(page_inner_info = page_inner_info, tag_url = tag_url_all)
# write.csv(info,"info.csv")
# write.csv(page_inner_info_data,"page_inner_info_data.csv")
#-------------------------------------------------------
#Function3:统计标签的出现次数
#-------------------------------------------------------
library(jiebaR)
library(wordcloud2)
# Count how often each tag occurs, sort from most to least frequent, draw a
# word cloud and show the top twenty.
page_inner_info_freq <- freq(page_inner_info)
page_inner_info_freq <- page_inner_info_freq[
  order(page_inner_info_freq$freq, decreasing = TRUE),
]
wordcloud2(page_inner_info_freq)
# head() stops at the last row, so fewer than twenty distinct tags no longer
# produce all-NA filler rows.
head(page_inner_info_freq, 20)
#-------------------------------------------------------
#
# Chapter2:对标签下的视频信息进行批量爬取
#
#-------------------------------------------------------
library(stringr)
# Tags of interest; only those with a resolved link are crawled, once each.
key_word <- c("R语言", "生物信息学", "生物信息", "数据挖掘", "机器学习", "linux", "数据分析", "人工智能", "PYTHON")
is_wanted <- page_inner_info_data$page_inner_info %in% key_word &
  page_inner_info_data$tag_url != "http:NA"
page_inner_info_data_sub <- page_inner_info_data[is_wanted, ]
page_inner_info_data_sub <- page_inner_info_data_sub[
  !duplicated(page_inner_info_data_sub$page_inner_info),
]
top_url <- as.character(page_inner_info_data_sub$tag_url)
top_tag <- as.character(page_inner_info_data_sub$page_inner_info)

# Information collected for every video listed under a tag:
#   1. video title
#   2. uploader
#   3. video description
#   4. official tag
#   5. play count
#   6. danmaku (bullet comment) count
#   7. url
#   8. video category (the tag name being crawled)
# Only visible after opening the video itself (not collected here):
#   9. publish time
#  10. video duration
info_most_topic <- '//li[@class="content-item"]'  # carries fields 2-6
info_other_topic <- '//li/div/a[2]'               # carries fields 1 and 7

for (i in seq_along(top_url)) {
  remDr$navigate(top_url[i])
  shipin <- remDr$findElement(using = "xpath", value = '//*[@id="app"]/div[3]/div[2]/div[2]')
  remDr$mouseMoveToLocation(webElement = shipin)
  remDr$click()

  # Read the page count from the page just loaded. The original looked it up
  # in `page_topic`, which is undefined on the first tag and stale afterwards.
  page_topic1 <- read_html(remDr$getPageSource()[[1]])
  total_page <- html_node(page_topic1, xpath = '//*[@id="app"]/div[3]/div[3]/div/div/div[2]/div/div/div[1]/span[1]') %>%
    html_text() %>%
    str_extract(pattern = "[0-9]{1,}") %>%
    as.numeric()
  if (is.na(total_page) || total_page < 1) {
    warning("No page count found for tag '", top_tag[i], "'; tag skipped.",
            call. = FALSE)
    next
  }

  topic_pages <- vector("list", total_page)
  for (j in seq_len(total_page)) {
    # Jump to page j via the pager's input box.
    input_num <- remDr$findElement(using = "xpath", '//input[@class="page-input"]')
    input_num$clearElement()
    input_num$sendKeysToElement(list(as.character(j), key = "enter"))
    Sys.sleep(3)
    page_topic <- read_html(remDr$getPageSource()[[1]])

    items <- html_nodes(page_topic, xpath = info_most_topic)
    links <- html_nodes(page_topic, xpath = info_other_topic)
    # Build the columns directly so a page with a single item still gives a
    # one-row data frame (sapply would collapse it into a plain vector).
    topic_pages[[j]] <- data.frame(
      Title = html_attr(links, name = "title"),
      up = html_attr(items, name = "data-upmaster"),
      Introduce = html_attr(items, name = "data-text"),
      Official_tag = html_attr(items, name = "data-tagname"),
      Play_num = html_attr(items, name = "data-play"),
      Danmu_num = html_attr(items, name = "data-danmu"),
      # sprintf() keeps an empty page empty, where paste() would give "http:".
      Url = sprintf("http:%s", html_attr(links, name = "href")),
      Topic_name = rep(top_tag[i], length(items)),
      stringsAsFactors = FALSE
    )
  }
  info_topic_sub_all <- do.call(rbind, topic_pages)

  # One worksheet per tag in a shared workbook.
  # NOTE(review): no package attached in this script provides write.xlsx; the
  # sheetName/append/row.names arguments look like xlsx::write.xlsx --
  # confirm and attach that package before running.
  write.xlsx(info_topic_sub_all, "info_topic_all.xlsx", sheetName = top_tag[i], append = TRUE, row.names = FALSE)
}
#-------------------------------------------------------
#
# Chapter3:对所有视频的子标签再做探索,找到该类视频中出现次数最多的标签(套娃)
#
#-------------------------------------------------------
# 网友评论 (leftover web-page text, not part of the script)