R beginner practice project: scraping Stack Overflow questions and answers
Scrape the detailed content of every question on Stack Overflow's Frequent Questions pages.
Load the required libraries.
library(rvest)
library(stringr)
Define a function that collects the question links from a listing page.
getQuestionLinks <- function(url){
  page <- read_html(url)
  # relative links to each question on the listing page
  links <- page %>% html_nodes('.summary .question-hyperlink') %>% html_attr('href')
  # prepend the site root to get absolute URLs
  links <- paste0('https://stackoverflow.com', links)
  return(links)
}
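A quick smoke test (the class names used above reflect Stack Overflow's markup at the time of writing and may have changed since):
links1 <- getQuestionLinks('https://stackoverflow.com/questions?tab=Frequent&page=1')
head(links1)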
Define a function that extracts the question and its answers from a question page, returned as a list of two data frames.
extractQuestionAndAnswers <- function(url){
  page <- read_html(url)
  # extract the question id
  id <- page %>% html_node('.question') %>% html_attr('data-questionid')
  # extract the question vote count
  question_vote_count <- page %>% html_node('#question .ai-center') %>% html_text()
  # extract the question text
  question_text <- page %>% html_nodes('#question .js-post-body') %>% html_text() %>% trimws()
  # extract the question's user detail block
  quser_detail <- page %>% html_nodes('.pt4')
  # extract the question user name; when the card holds two user links
  # (e.g. after an edit), take the second, which is the owner
  temp_quser <- quser_detail %>% html_nodes('.user-details a') %>% html_text()
  question_user <- ifelse(length(temp_quser) > 1, temp_quser[2], temp_quser[1])
  # extract the question time (drop the trailing 'Z' from the ISO timestamp)
  temp_qtime <- quser_detail %>% html_nodes('.relativetime') %>% html_attr('title') %>% str_remove('Z')
  question_time <- ifelse(length(temp_qtime) > 1, temp_qtime[2], temp_qtime[1])
  # extract the question tags
  tags <- page %>% html_nodes('#question .ps-relative') %>% html_text() %>% trimws()
  # extract the user detail block of every answer
  auser_detail <- page %>% html_nodes('#answers .gsy')
  answer_users <- vector()
  answer_times <- vector()
  for (a in auser_detail){
    # extract the answer user name, stripping whitespace and any
    # "...% accept rate" text that precedes it
    temp_auser <- a %>% html_nodes('.user-details a') %>% html_text() %>% str_remove_all("[\r\n ]") %>% str_remove_all('.*%')
    answer_user <- ifelse(length(temp_auser) > 1, temp_auser[2], temp_auser[1])
    answer_users <- c(answer_users, answer_user)
    # extract the answer time
    temp_atime <- a %>% html_nodes('.relativetime') %>% html_attr('title') %>% str_remove('Z')
    answer_time <- ifelse(length(temp_atime) > 1, temp_atime[2], temp_atime[1])
    answer_times <- c(answer_times, answer_time)
  }
  # map every answer to the question id
  question_id <- rep(id, length(auser_detail))
  # extract the answer vote counts
  answer_vote_count <- page %>% html_nodes('#answers .fs-title') %>% html_text()
  # is this the accepted answer? TRUE/FALSE (answers without the itemprop
  # attribute yield NA, so coerce those to FALSE)
  accepted_answer <- page %>% html_nodes('.answer') %>% html_attr('itemprop') %>% str_detect('acceptedAnswer')
  accepted_answer[is.na(accepted_answer)] <- FALSE
  # extract the answer texts
  answer_text <- page %>% html_nodes('#answers .js-post-body') %>% html_text() %>% str_remove_all("[\r\n]")
  question <- data.frame(id, question_vote_count, question_text, question_user, question_time, tags, url)
  answer <- data.frame(question_id, answer_vote_count, accepted_answer, answer_text, answer_users, answer_times)
  return(list(question = question, answers = answer))
}
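Before running the full crawl, it is worth testing the extractor on a single question page; a sketch, using one well-known question as the example URL:
test <- extractQuestionAndAnswers('https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example')
str(test$question)
str(test$answers)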
Get the question links from the first two listing pages, deduplicating as we go.
links <- character()
for (page in 1:2){
  url <- paste0('https://stackoverflow.com/questions?tab=Frequent&page=', page)
  Sys.sleep(0.2)  # pause briefly between requests to be polite to the server
  thisLinks <- getQuestionLinks(url)
  for (link in thisLinks){
    if (!(link %in% links)){
      links <- c(links, link)
    }
  }
}
print(links)
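As an aside, the deduplicating loop above can be written more compactly with unique(); a sketch that produces the same links vector:
links <- unique(unlist(lapply(1:2, function(p){
  Sys.sleep(0.2)
  getQuestionLinks(paste0('https://stackoverflow.com/questions?tab=Frequent&page=', p))
})))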
Define empty data frames to accumulate the questions and answers. The column names must match those produced by extractQuestionAndAnswers, or the first rbind() will fail.
questionDF <- data.frame(id = character(), question_vote_count = character(), question_text = character(), question_user = character(), question_time = character(), tags = character(), url = character())
answerDF <- data.frame(question_id = character(), answer_vote_count = character(), accepted_answer = logical(), answer_text = character(), answer_users = character(), answer_times = character())
For each question page, extract the posts and append them to the data frames.
count <- 0
for (url in links){
  Sys.sleep(0.2)
  count <- count + 1
  print(paste0(count, '/', length(links), ': ', url))  # progress indicator
  results <- extractQuestionAndAnswers(url)
  questionDF <- rbind(questionDF, results$question)
  answerDF <- rbind(answerDF, results$answers)
}
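Scrapes like this often fail partway through when one page's markup deviates (a deleted question, a missing selector). A more defensive variant of the loop above, sketched with tryCatch so a bad page is skipped rather than aborting the whole run:
for (url in links){
  Sys.sleep(0.2)
  results <- tryCatch(extractQuestionAndAnswers(url),
                      error = function(e){
                        message('skipping ', url, ': ', conditionMessage(e))
                        NULL
                      })
  if (is.null(results)) next
  questionDF <- rbind(questionDF, results$question)
  answerDF <- rbind(answerDF, results$answers)
}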
View and write out the result data frames.
View(answerDF)
View(questionDF)
write.csv(questionDF, "questions.csv", row.names = FALSE)
write.csv(answerDF, "answers.csv", row.names = FALSE)