The scraper framework I used for the Douban Movie Top 250 works very well on static sites. Today I practiced by reusing it, with a few small improvements to the details, to scrape Hupu NBA, another static site whose structure is actually simpler than Douban Movie's.
# import library
import requests
from bs4 import BeautifulSoup
import os
# get html text -- the standard page-request framework
def fGetHtmlText(vUrl):
    try:
        vHeaders = {"user-agent": "Mozilla/5.0"}
        # a timeout keeps a dead connection from hanging the script forever
        r = requests.get(vUrl, headers=vHeaders, timeout=10)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        print("There is something wrong!")
# parse the html text -- use Beautiful Soup 4 to parse the page and pull out
# the parts we need; bs4, XPath and regular expressions are the three main
# ways a scraper extracts content. The file-saving function is called from
# inside this extraction step.
def fSoup(vHtml):
    vSoup = BeautifulSoup(vHtml, "html.parser")
    # css selector
    vDivNewsList = vSoup.select(".news-list li")
    for vLi in vDivNewsList:
        # title
        vTitle = vLi.find("h4").a.text
        # where the news comes from
        vComeFrom = vLi.find("span", class_="comeFrom").a.text
        # the news item's url on HuPu
        vNewsUrl = vLi.find("h4").a["href"]
        # the news item's original url
        vComeFromUrl = vLi.find("span", class_="comeFrom").a["href"]
        # save data
        fSaveData(vTitle, vComeFrom, vNewsUrl, vComeFromUrl)
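# [sketch, not in the original post] the same fields can be pulled out with
# XPath via lxml, the second of the three extraction methods mentioned above;
# the xpath expressions mirror the css selectors and are my assumptions about
# the page layout, not something verified against the live site
def fSoupXPath(vHtml):
    from lxml import etree
    vTree = etree.HTML(vHtml)
    for vLi in vTree.xpath('//*[@class="news-list"]//li'):
        vTitle = vLi.xpath('.//h4/a/text()')
        vNewsUrl = vLi.xpath('.//h4/a/@href')
        vComeFrom = vLi.xpath('.//span[@class="comeFrom"]/a/text()')
        vComeFromUrl = vLi.xpath('.//span[@class="comeFrom"]/a/@href')
        # xpath() returns lists, so unpack the first match only when it exists
        if vTitle and vNewsUrl and vComeFrom and vComeFromUrl:
            fSaveData(vTitle[0], vComeFrom[0], vNewsUrl[0], vComeFromUrl[0])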
# save data -- called from the parsing step; appends one record to the csv file
def fSaveData(title, comeFrom, newsUrl, comeFromUrl):
    # open in append mode; utf-8 keeps Chinese titles intact
    f = open('F:\\PythonData\\huPuTiYu\\huPuTiYu.csv', 'a', encoding='utf-8')
    f.write(f'{title}, {comeFrom}, {newsUrl}, {comeFromUrl}\n')
    f.close()
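# [sketch, not in the original post] a title that itself contains a comma would
# break the hand-written csv line above; the standard csv module quotes such
# fields automatically -- a safer variant using the same file path
def fSaveDataCsv(title, comeFrom, newsUrl, comeFromUrl):
    import csv
    # newline='' is how the csv docs say to open files, to avoid blank rows on Windows
    with open('F:\\PythonData\\huPuTiYu\\huPuTiYu.csv', 'a', encoding='utf-8', newline='') as f:
        csv.writer(f).writerow([title, comeFrom, newsUrl, comeFromUrl])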
# judge whether the folder and file exist -- run before the main program starts
def fJudgeFile():
    if not os.path.exists("F:\\PythonData\\huPuTiYu\\"):
        os.makedirs("F:\\PythonData\\huPuTiYu\\")
    # remove any csv left over from a previous run so records are not duplicated
    if os.path.exists("F:\\PythonData\\huPuTiYu\\huPuTiYu.csv"):
        os.remove("F:\\PythonData\\huPuTiYu\\huPuTiYu.csv")
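# [sketch, not in the original post] the same check-and-reset can be written
# with pathlib, which avoids repeating the hard-coded path string three times
def fJudgeFilePathlib():
    from pathlib import Path
    vFolder = Path("F:\\PythonData\\huPuTiYu")
    vFolder.mkdir(parents=True, exist_ok=True)  # create the folder if missing
    vCsv = vFolder / "huPuTiYu.csv"
    if vCsv.exists():
        vCsv.unlink()  # start each run with a fresh file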
# main function
def main(vUrl):
    fJudgeFile()
    vHtml = fGetHtmlText(vUrl)
    fSoup(vHtml)  # fSoup saves as it parses and returns nothing
# use the main function
vUrl = "https://voice.hupu.com/nba"
print("Start crawling")
main(vUrl)
print("Crawling finished")