The scraper framework I used for the Douban Movie Top 250 works very well on static sites. Today I practiced by reusing it, with a few small improvements to the details, to scrape Hupu NBA, another static site whose structure is actually simpler than Douban Movie's.
# import library
import requests
from bs4 import BeautifulSoup
import os
# get html text -- the standard page-request framework
def fGetHtmlText(vUrl):
    try:
        vHeaders = {"user-agent": "Mozilla/5.0"}
        # a timeout keeps a dead connection from hanging the script forever
        r = requests.get(vUrl, headers=vHeaders, timeout=10)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        print("There is something wrong!")
# parse the html text -- use Beautiful Soup 4 to parse the page and pull out
# the parts we need; bs4, XPath and regular expressions are the three main
# ways a scraper extracts content. The file-saving function is called from
# inside this extraction step.
def fSoup(vHtml):
    vSoup = BeautifulSoup(vHtml, "html.parser")
    # css selector
    vDivNewsList = vSoup.select(".news-list li")
    for vLi in vDivNewsList:
        # title
        vTitle = vLi.find("h4").a.text
        # where the news comes from
        vComeFrom = vLi.find("span", class_="comeFrom").a.text
        # the news item's url on HuPu
        vNewsUrl = vLi.find("h4").a["href"]
        # the news item's original url
        vComeFromUrl = vLi.find("span", class_="comeFrom").a["href"]
        # save data
        fSaveData(vTitle, vComeFrom, vNewsUrl, vComeFromUrl)
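# [sketch, not in the original post] the same fields can be pulled out with
# XPath via lxml, the second of the three extraction methods mentioned above;
# the xpath expressions mirror the css selectors and are my assumptions about
# the page layout, not something verified against the live site
def fSoupXPath(vHtml):
    from lxml import etree
    vTree = etree.HTML(vHtml)
    for vLi in vTree.xpath('//*[@class="news-list"]//li'):
        vTitle = vLi.xpath('.//h4/a/text()')
        vNewsUrl = vLi.xpath('.//h4/a/@href')
        vComeFrom = vLi.xpath('.//span[@class="comeFrom"]/a/text()')
        vComeFromUrl = vLi.xpath('.//span[@class="comeFrom"]/a/@href')
        # xpath() returns lists, so unpack the first match only when it exists
        if vTitle and vNewsUrl and vComeFrom and vComeFromUrl:
            fSaveData(vTitle[0], vComeFrom[0], vNewsUrl[0], vComeFromUrl[0])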
# save data -- called from the parsing step; appends one record to the csv file
def fSaveData(title, comeFrom, newsUrl, comeFromUrl):
    # open in append mode; utf-8 keeps Chinese titles intact
    f = open('F:\\PythonData\\huPuTiYu\\huPuTiYu.csv', 'a', encoding='utf-8')
    f.write(f'{title}, {comeFrom}, {newsUrl}, {comeFromUrl}\n')
    f.close()
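# [sketch, not in the original post] a title that itself contains a comma would
# break the hand-written csv line above; the standard csv module quotes such
# fields automatically -- a safer variant using the same file path
def fSaveDataCsv(title, comeFrom, newsUrl, comeFromUrl):
    import csv
    # newline='' is how the csv docs say to open files, to avoid blank rows on Windows
    with open('F:\\PythonData\\huPuTiYu\\huPuTiYu.csv', 'a', encoding='utf-8', newline='') as f:
        csv.writer(f).writerow([title, comeFrom, newsUrl, comeFromUrl])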
# judge whether the folder and file exist -- run before the main program starts
def fJudgeFile():
    if not os.path.exists("F:\\PythonData\\huPuTiYu\\"):
        os.makedirs("F:\\PythonData\\huPuTiYu\\")
    # remove any csv left over from a previous run so records are not duplicated
    if os.path.exists("F:\\PythonData\\huPuTiYu\\huPuTiYu.csv"):
        os.remove("F:\\PythonData\\huPuTiYu\\huPuTiYu.csv")
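# [sketch, not in the original post] the same check-and-reset can be written
# with pathlib, which avoids repeating the hard-coded path string three times
def fJudgeFilePathlib():
    from pathlib import Path
    vFolder = Path("F:\\PythonData\\huPuTiYu")
    vFolder.mkdir(parents=True, exist_ok=True)  # create the folder if missing
    vCsv = vFolder / "huPuTiYu.csv"
    if vCsv.exists():
        vCsv.unlink()  # start each run with a fresh file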
# main function
def main(vUrl):
    fJudgeFile()
    vHtml = fGetHtmlText(vUrl)
    fSoup(vHtml)  # fSoup saves as it parses and returns nothing
# use the main function
vUrl = "https://voice.hupu.com/nba"
print("Start crawling")
main(vUrl)
print("Crawling finished")