
Web Scraping with Python

Author: 詹徐照 | Published 2018-10-28 21:41

    Environment

    PyCharm, Python 3.7

    Chapter 1: Fetching the Title

    Simple version

    from urllib.request import urlopen
    from bs4 import BeautifulSoup
    html = urlopen("http://www.pythonscraping.com/pages/page1.html")
    bs = BeautifulSoup(html.read(), features="html.parser")
    # BeautifulSoup lets you omit intermediate tags, so all three
    # navigations reach the same first <h1> on the page
    print(bs.html.body.h1)
    print(bs.body.h1)
    print(bs.h1)
    

    Console output:

    <h1>An Interesting Title</h1>
    <h1>An Interesting Title</h1>
    <h1>An Interesting Title</h1>
    

    Full version, with error handling

    from urllib.request import urlopen
    from urllib.error import HTTPError, URLError
    from bs4 import BeautifulSoup
    
    
    def getTitle(url):
        try:
            html = urlopen(url)
        except (HTTPError, URLError) as e:
            # the server returned an error page, or could not be reached at all
            print(e)
            return None
        try:
            bsObj = BeautifulSoup(html.read(), features="html.parser")
            title = bsObj.body.h1
        except AttributeError as e:
            # the document has no <body> tag, so the .h1 lookup fails
            print(e)
            return None
        return title
    
    
    title = getTitle("http://www.pythonscraping.com/pages/page1.html")
    print(title)
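
    If the page cannot be fetched, or the <h1> cannot be found, getTitle returns None, so callers should check the result before using it. A minimal usage sketch (the missing-page URL is hypothetical):

    title = getTitle("http://www.pythonscraping.com/pages/missing.html")  # hypothetical URL
    if title is None:
        print("Title could not be found")
    else:
        print(title.get_text())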
    

    Chapter 2

    2.1 findAll(): get all elements matching a tag

    from urllib.request import urlopen
    from bs4 import BeautifulSoup
    
    html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
    bsObj = BeautifulSoup(html.read(), features="html.parser")
    nameList = bsObj.findAll("span", {"class": "green"})
    for name in nameList:
        print(name.get_text())
        
    
    # match several classes or several tags at once
    bsObj.findAll("span", {"class": {"green", "red"}})
    bsObj.findAll({"h1", "h2", "h3"})

    # the following two lines are equivalent
    bsObj.findAll(id="text")
    bsObj.findAll("", {"id": "text"})
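
    In BeautifulSoup 4, findAll is a legacy alias for find_all, and find returns only the first match rather than a list. A minimal sketch against the same warandpeace.html page:

    # find() returns the first matching tag (or None); find_all() returns a list
    first_green = bsObj.find("span", {"class": "green"})
    all_green = bsObj.find_all("span", {"class": "green"})  # identical to findAll
    print(first_green.get_text())
    print(len(all_green))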
    

    2.2 Regular expressions

    Fetch images whose src matches a path pattern

    import re
    from urllib.request import urlopen
    from bs4 import BeautifulSoup

    html = urlopen("http://www.pythonscraping.com/pages/page3.html")
    bsObj = BeautifulSoup(html.read(), features="html.parser")
    # match images whose src starts with ../img/gifts/img and ends with .jpg;
    # a raw string avoids invalid-escape warnings, and / needs no escaping
    images = bsObj.findAll("img", {"src": re.compile(r"\.\./img/gifts/img.*\.jpg")})
    for image in images:
        print(image["src"])
    

    Console output:

    ../img/gifts/img1.jpg
    ../img/gifts/img2.jpg
    ../img/gifts/img3.jpg
    ../img/gifts/img4.jpg
    ../img/gifts/img6.jpg
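
    The matched src values are relative paths. A minimal sketch of resolving them against the page URL and saving the files locally, where the filename scheme (last path segment) is my own choice:

    from urllib.parse import urljoin
    from urllib.request import urlretrieve

    pageUrl = "http://www.pythonscraping.com/pages/page3.html"
    for image in images:
        absUrl = urljoin(pageUrl, image["src"])  # resolves ../img/... against the page URL
        fileName = absUrl.split("/")[-1]         # e.g. img1.jpg
        urlretrieve(absUrl, fileName)            # download into the working directory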
    

    Chapter 3: Crawling

    3.1 Traversing a single domain

    Get all the links on a page

    import re
    from urllib.request import urlopen
    from bs4 import BeautifulSoup

    html = urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
    bsObj = BeautifulSoup(html.read(), features="html.parser")
    links = bsObj.findAll("a")
    for link in links:
        if "href" in link.attrs:
            print(link.attrs['href'])

    # refine the filter: keep only article links inside the body content
    links = bsObj.find("div", {"id": "bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))
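
    The pattern ^(/wiki/)((?!:).)*$ keeps only article links: hrefs that start with /wiki/ and contain no colon, which screens out special pages such as /wiki/Category: pages. A quick check of what the pattern accepts (the example paths are my own):

    pattern = re.compile("^(/wiki/)((?!:).)*$")
    print(bool(pattern.match("/wiki/Kevin_Bacon")))      # True: plain article link
    print(bool(pattern.match("/wiki/Category:Actors")))  # False: contains a colon
    print(bool(pattern.match("/w/index.php")))           # False: not under /wiki/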
    
    

    Iteratively follow links from page to page

    import datetime
    import random
    import re
    from urllib.request import urlopen
    from bs4 import BeautifulSoup


    def getLinks(articleUrl):
        html = urlopen("http://en.wikipedia.org" + articleUrl)
        bsObj = BeautifulSoup(html.read(), features="html.parser")
        return bsObj.find("div", {"id": "bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))


    random.seed(datetime.datetime.now())
    links = getLinks("/wiki/Kevin_Bacon")
    while len(links) > 0:
        # hop to a randomly chosen article link on the current page
        newArticle = links[random.randint(0, len(links) - 1)].attrs["href"]
        print(newArticle)
        links = getLinks(newArticle)
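
    As written, this random walk only stops when a page has no article links, and it may revisit pages it has already seen. A common refinement, sketched here with variable names of my own choosing, is to record visited hrefs in a set and follow only fresh links:

    visited = set()
    links = getLinks("/wiki/Kevin_Bacon")
    while len(links) > 0:
        # keep only links we have not followed yet
        fresh = [link for link in links if link.attrs["href"] not in visited]
        if not fresh:
            break
        newArticle = random.choice(fresh).attrs["href"]
        visited.add(newArticle)
        print(newArticle)
        links = getLinks(newArticle)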
    
    
