美文网首页
requests+BeautifulSoup 实现猫眼TOP100

requests+BeautifulSoup 实现猫眼TOP100

作者: 把握_cc79 | 来源:发表于2018-07-16 16:28 被阅读0次
    import requests
    from bs4 import BeautifulSoup
    import bs4
    import pprint
    
    
    def get_html(url, headers, timeout=10):
        """Fetch ``url`` with HTTP GET and return the body decoded as UTF-8.

        Args:
            url: Address to request.
            headers: Mapping of HTTP headers sent with the request.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the script forever.

        Returns:
            The response text, or ``None`` when the request fails or the
            server answers with an error status (``"ERROR"`` is printed in
            that case).
        """
        try:
            r = requests.get(url, headers=headers, timeout=timeout)
            r.raise_for_status()
        except requests.RequestException:
            # Only network/HTTP failures are expected here; anything else
            # (KeyboardInterrupt, programming errors) should propagate.
            print("ERROR")
            return None
        # Force UTF-8 before reading .text instead of relying on the
        # encoding requests picks for the response.
        r.encoding = 'utf-8'
        return r.text
    
    
    def fill_list(html, ulist):
        """Parse one board page and append a row per ``<dd>`` entry to ``ulist``.

        Each appended row is ``[rank, name, stat, releasetime, score]``, read
        from the first ``<i>``, the first three ``<p>`` tags and the second and
        third ``<i>`` tags of the entry.

        Args:
            html: Page markup as a string. ``None`` or an empty string is
                accepted and leaves ``ulist`` untouched.
            ulist: List that is extended in place with the parsed rows.

        Returns:
            None. Results are delivered through ``ulist``.
        """
        if not html:
            # A failed download yields no markup; there is nothing to parse.
            return
        soup = BeautifulSoup(html, 'html.parser')
        for dd in soup('dd'):
            # Look the child tags up once instead of once per field.
            paragraphs = dd.find_all('p')
            italics = dd.find_all('i')
            if len(paragraphs) < 3 or len(italics) < 3:
                # Entry does not have the expected layout; skip it rather
                # than abort the whole page with an IndexError.
                continue
            stat_text = paragraphs[1].string
            # Presumably the integer and fractional halves of the score,
            # stored in separate <i> tags -- verify against the page markup.
            score_head = italics[1].string
            score_tail = italics[2].string
            if stat_text is None or score_head is None or score_tail is None:
                # .string is None when a tag has no single text child.
                continue
            rank = italics[0].string
            name = paragraphs[0].string
            # Drop all whitespace (including newlines) inside the text.
            stat = "".join(stat_text.split())
            releasetime = paragraphs[2].string
            score = score_head + score_tail
            ulist.append([rank, name, stat, releasetime, score])
    
    
    if __name__ == '__main__':
        # Browser-like User-Agent sent with every request.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit'
                          '/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari'
                          '/537.36'
        }
        base_url = 'http://maoyan.com/board/4'
        deep = 10  # number of pages to fetch; the offset advances by 10 per page
        ulist = []
        for j in range(deep):
            url = base_url + '?offset=' + str(j * 10)
            html = get_html(url, headers)
            if html is None:
                # The download failed (get_html printed "ERROR"); skip this
                # page instead of handing None to the parser.
                continue
            fill_list(html, ulist)
        # pprint is used only to make the output easier to read; it is not
        # required, and the rows could be written straight to a database.
        pprint.pprint(ulist)
    

    爬取效果:


    image.png

    相关文章

      网友评论

          本文标题:requests+BeautifulSoup 实现猫眼TOP100

          本文链接:https://www.haomeiwen.com/subject/gzubpftx.html