
Batch Scraping Baidu Tieba Data with Python

Author: 黑猫编程 | Published 2019-10-03 19:24

    Analyze the Baidu Tieba URL structure

    The query parameter kw=python names the forum (it can also be passed to requests as a params dict), and pn controls pagination: pn=0 is page 1, pn=50 is page 2, pn=100 is page 3, advancing by 50 per page.

    First, construct the URLs for the first 10 pages:

    # -*- coding: utf-8 -*-
    # @Time    : 2019/10/3 18:56
    # @Author  : 币行者
    # @Email   : xypip@qq.com
    # @File    : test5.py
    
    import requests
    
    headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}
    
    # URL template: the first {} takes the forum keyword, the second the pn offset
    url_temp = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn={}"
    # First 10 pages: pn = 0, 50, 100, ..., 450
    url_list = [url_temp.format("python", i * 50) for i in range(10)]
    
    print(url_list)
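
    Since kw and pn are ordinary query parameters, requests can also build the query string from a params dict instead of formatting the URL by hand. A minimal sketch of that alternative, reusing the same User-Agent header:

    import requests

    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}

    # requests encodes ?kw=python&ie=utf-8&pn=50 from the dict automatically
    response = requests.get(
        "https://tieba.baidu.com/f",
        params={"kw": "python", "ie": "utf-8", "pn": 50},  # pn=50 -> page 2
        headers=headers,
    )
    print(response.url)  # the fully encoded URL that was actually requested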
    

    Fetch all 10 pages and save each one locally as an HTML file:

    # -*- coding: utf-8 -*-
    # @Time    : 2019/10/3 18:56
    # @Author  : 币行者
    # @Email   : xypip@qq.com
    # @File    : test5.py
    
    import requests
    
    headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}
    
    url_temp = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn={}"
    url_list = [url_temp.format("python", i * 50) for i in range(10)]
    
    # print(url_list)
    
    # Download each page and write it to a local HTML file
    for page_num, url in enumerate(url_list, start=1):
        response = requests.get(url, headers=headers)
        html_str = response.content.decode()  # decode the raw bytes (UTF-8 by default)
        file_path = "{}—第{}页.html".format("python", page_num)
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(html_str)
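
    The loop above assumes every request succeeds. In practice it is worth adding a timeout and a status check, so a failed page raises a clear error instead of silently saving an error page. A minimal sketch of a hardened fetch helper (the name fetch_page is just illustrative):

    import requests

    def fetch_page(url, headers, timeout=10):
        # Raises for network errors, timeouts, and non-2xx responses
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        # response.text decodes using the encoding detected by requests
        return response.text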
    

    An object-oriented version of the same crawler:

    # -*- coding: utf-8 -*-
    # @Time    : 2019/10/3 18:37
    # @Author  : 币行者
    # @Email   : xypip@qq.com
    # @File    : baidutieba_spider.py
    
    import requests
    
    class TiebaSpider:
    
        def __init__(self, tieba_name, tieba_num):
            # tieba_name: forum keyword (kw); tieba_num: number of pages to fetch
            self.tieba_name = tieba_name
            self.tieba_num = tieba_num
            self.url_temp = "https://tieba.baidu.com/f?kw=" + tieba_name + "&ie=utf-8&pn={}"
            self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"}
    
        def get_url_list(self):
            # Build one URL per page: pn = 0, 50, 100, ...
            return [self.url_temp.format(i * 50) for i in range(self.tieba_num)]

        def parse_url(self, url):
            # Download a single page and return its HTML as a string
            print(url)
            response = requests.get(url, headers=self.headers)
            return response.content.decode()

        def save_html(self, html_str, page_num):
            # Write one page to a local file named after the forum and page number
            file_path = "{}—第{}页.html".format(self.tieba_name, page_num)
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(html_str)

        def run(self):
            url_list = self.get_url_list()
            for page_num, url in enumerate(url_list, start=1):  # page numbers start at 1
                html_str = self.parse_url(url)
                self.save_html(html_str, page_num)
    
    
    
    if __name__ == '__main__':
    
        tieba_spider = TiebaSpider("python", 10)
        tieba_spider.run()
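
    A usage note: the constructor takes the forum keyword and the page count, so the same class can scrape any forum. As a hypothetical extension (the argument names here are illustrative, not part of the original script), the entry point could read them from the command line with argparse, assuming it lives in the same file as the TiebaSpider class above:

    import argparse

    if __name__ == "__main__":
        parser = argparse.ArgumentParser(description="Batch-download Baidu Tieba pages")
        parser.add_argument("keyword", help="forum keyword, e.g. python")
        parser.add_argument("--pages", type=int, default=10, help="number of pages to fetch")
        args = parser.parse_args()

        TiebaSpider(args.keyword, args.pages).run()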
    
