py Scraping 13: Practice - Scraping Lianjia Second-Hand Housing Listings

Author: _百草_ | Published 2022-08-01 11:49
    1. Analyze the URLs
    https://bj.lianjia.com/ershoufang/
    https://bj.lianjia.com/ershoufang/pg2/
    https://bj.lianjia.com/ershoufang/pg3/
    ...
    # Page 1: https://bj.lianjia.com/ershoufang/pg1/ also redirects to https://bj.lianjia.com/ershoufang/
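
    As a quick illustration, here is a minimal sketch (not from the original post) of building the paginated URLs; the pg{n} template matches the one used by the class below:

    base = "https://bj.lianjia.com/ershoufang/{}"
    urls = [base.format(f"pg{n}") for n in range(1, 4)]
    # ['https://bj.lianjia.com/ershoufang/pg1', 'https://bj.lianjia.com/ershoufang/pg2', 'https://bj.lianjia.com/ershoufang/pg3']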
    
    2. Identify the XPath for each element
    # Inspect the listing markup
    """
    <div class="title"><a class="" href="https://" target="_blank" data-log_index="5" data-el="ershoufang" data-housecode="101115961312" data-is_focus="" data-sl="">新通国际南北通透两居室 双落地阳台 精装修 诚心出售</a><span class="goodhouse_tag tagBlock">必看好房</span></div>
    <div class="positionInfo"><span class="positionIcon"></span><a href="https://" target="_blank" data-log_index="5" data-el="region">新通国际花园 </a>   -  <a href="https://" target="_blank">梨园</a> </div>
    <div class="houseInfo"><span class="houseIcon"></span>2室1厅 | 94.05平米 | 南 北 | 其他 | 低楼层(共28层)  | 板塔结合</div>
    <div class="priceInfo"><div class="totalPrice totalPrice2"><i> </i><span class="">465</span><i>万</i></div><div class="unitPrice" data-hid="101115961312" data-rid="1111027381229" data-price="49442"><span>49,442元/平</span></div></div>
    """
    
    .//div[@class='info clear']  # all info for one listing; note: 1) the leading . is required, 2) every class name must be included
    # Then extract each field from the single listing node
    .//a[@data-housecode]/text()  # listing title
    .//div[@class='positionInfo']//text()  # location
    .//div[@class='houseInfo']/text()  # basic info, e.g. 3房2厅 (3 bedrooms, 2 living rooms)
    .//div[@class='totalPrice totalPrice2']//text()  # total price
    .//div[@class='unitPrice']//text()  # price per square meter
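
    Before wiring these into the crawler, the expressions can be sanity-checked against the sample markup above. A minimal sketch, assuming lxml is installed (the snippet is a trimmed copy of the markup shown earlier):

    from lxml import etree

    sample = """
    <div class="info clear">
      <div class="title"><a href="https://" data-housecode="101115961312">新通国际南北通透两居室</a></div>
      <div class="positionInfo"><a href="https://">新通国际花园</a> - <a href="https://">梨园</a></div>
      <div class="houseInfo">2室1厅 | 94.05平米 | 南 北</div>
      <div class="priceInfo">
        <div class="totalPrice totalPrice2"><span>465</span><i>万</i></div>
        <div class="unitPrice"><span>49,442元/平</span></div>
      </div>
    </div>
    """
    div = etree.HTML(sample).xpath(".//div[@class='info clear']")[0]
    print(div.xpath(".//a[@data-housecode]/text()")[0])  # 新通国际南北通透两居室
    print(''.join(div.xpath(".//div[@class='totalPrice totalPrice2']//text()")))  # 465万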
    
    3. Improve robustness
      Add a retry mechanism to reduce the impact of network hiccups (a minimal sketch follows this list).
    4. Other notes
      If the CSV header is written inside save_file(), it is re-written on every call; the header write therefore lives in run() (see the second sketch below).
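
    To make the retry idea concrete, here is a minimal standalone sketch (fetch_with_retry is a hypothetical helper, not part of the original code; the LianJia class below implements the same idea with a counter attribute, and its ua value comes from Faker):

    import time
    from urllib.request import Request, urlopen

    def fetch_with_retry(url, ua, retries=3, timeout=5):
        """Fetch url, retrying up to `retries` times on network errors."""
        for attempt in range(retries):
            try:
                req = Request(url, headers={"User-Agent": ua})
                return urlopen(req, timeout=timeout).read().decode("utf-8")
            except Exception as e:
                print(f"Request failed (attempt {attempt + 1}): {e}")
                time.sleep(1)  # brief pause before retrying
        return None  # all retries exhausted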
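
    And the header-once pattern for the CSV file, again as a minimal sketch (file name and fields are illustrative):

    import csv

    fields = ["house", "positionInfo"]
    with open("out.csv", "w", encoding="utf-8", newline="") as f:
        csv.DictWriter(f, fields).writeheader()  # header written exactly once, in "w" mode
    with open("out.csv", "a", encoding="utf-8", newline="") as f:
        csv.DictWriter(f, fields).writerows([{"house": "t1", "positionInfo": "p1"}])  # later calls append rows only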

    Code:

    # -*- coding:utf-8 -*-
    """
    @author:百草Lily
    @file:test_lianjia.py
    @time:2022/8/1
    """
    from faker import Faker
    from urllib.request import Request, urlopen
    from os.path import join, dirname
    import time
    import random
    from lxml import etree
    import csv
    
    
    class LianJia:
        """
        Scrape Lianjia second-hand housing listings.
        """
    
        def __init__(self):
            self.url = "https://bj.lianjia.com/ershoufang/{}"
            self.tabhead = ["house", "positionInfo", "houseInfo", "totalPrice", "unitPrice"]
            self.retry = 0  # retry counter
    
        def get_req(self, url):
            """
            Send the request, retrying on failure.
            :param url:
            :return: response body as a str, or None once all retries fail
            """
            # Retry up to 3 times to smooth over network hiccups, with a request timeout
            if self.retry < 3:
                try:
                    ua = Faker(locale="zh_CN").user_agent()  # note: Faker's keyword is locale, not local
                    req = Request(url, headers={"User-Agent": ua})  # no proxy server configured
                    resp = urlopen(req, timeout=5).read().decode("utf-8")  # decode the bytes response
                    self.retry = 0  # reset so later pages get a fresh retry budget
                    return resp
                except Exception as e:
                    print(f"Request failed: {e}")
                    self.retry += 1
                    return self.get_req(url)
            self.retry = 0
            return None  # retries exhausted
    
        def parse_html(self, html):
            """
            Parse a listing page into one dict per listing.
            :param html:
            :return: list of dicts keyed by self.tabhead
            """
            root = etree.HTML(html)  # parsed document tree (named to avoid shadowing this method)
            divs = root.xpath(".//div[@class='info clear']")  # the leading . is required; every class name must be included
            info = []
            for ele in divs:
                house = ele.xpath(".//a[@data-housecode]/text()")
                positionInfo = ''.join(ele.xpath(".//div[@class='positionInfo']//text()"))
                houseInfo = ele.xpath(".//div[@class='houseInfo']/text()")
                # e.g. "3室2厅 | 199.38平米 | 西南 | 精装 | 中楼层(共22层) | 2011年建 | 塔楼"; these parts could be stored as separate columns:
                # step 1: x1, x2, x3, x4 = s.split(" | ")  # unpack (a sketch of this follows the full listing)
                # step 2: add the matching field names to self.tabhead
                # step 3: add the matching labels to the header row written in run()
                totalPrice = ''.join(ele.xpath(".//div[@class='totalPrice totalPrice2']//text()"))  # total price
                unitPrice = ele.xpath(".//div[@class='unitPrice']//text()")  # price per square meter
                info.append(dict(zip(self.tabhead,
                                     [house[0], positionInfo, houseInfo[0], totalPrice, unitPrice[0]])))
            return info
    
        def save_file(self, filename, data):
            """
            Append rows to the CSV file.
            :param filename:
            :param data: list of dicts keyed by self.tabhead
            :return:
            """
            # Plain f.write(row) raises "TypeError: write() argument must be str, not dict",
            # so csv.DictWriter writes the dict rows. The header is deliberately not written
            # here: doing so on every call duplicated it, which is why run() writes it once.
            with open(filename, "a", encoding="utf-8", newline="") as f:
                writer = csv.DictWriter(f, self.tabhead)
                writer.writerows(data)
    
        def run(self, pg):
            """
            Entry point: scrape the first `pg` pages into one CSV file.
            :param pg:
            :return:
            """
            # Write the header here, exactly once, to fix the duplicate-header bug
            filename = join(dirname(__file__), f"链家前{pg}页_{time.strftime('%Y%m%d%H%M%S')}.csv")
            with open(filename, "w", encoding="utf-8", newline="") as f:
                writer = csv.DictWriter(f, self.tabhead)
                writer.writerow(dict(zip(self.tabhead, ["房屋", "位置", "房屋信息", "总价", "单价"])))  # header row; writing it in save_file() repeated it per page

            for i in range(pg):
                url = self.url.format(f"pg{i + 1}")
                resp = self.get_req(url)
                if not resp:  # every retry failed; skip this page
                    continue
                res = self.parse_html(resp)
                self.save_file(filename, res)
                time.sleep(random.uniform(2.0, 3.0))  # polite delay between requests
    
    
    if __name__ == "__main__":
        page = 2
        LianJia().run(page)
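
    The comments in parse_html() suggest splitting houseInfo into separate columns. A minimal sketch of that unpacking (illustrative values; real listings do not always carry every field, hence the length check):

    s = "3室2厅 | 199.38平米 | 西南 | 精装 | 中楼层(共22层) | 2011年建 | 塔楼"
    parts = s.split(" | ")
    if len(parts) == 7:  # guard: some listings omit fields
        layout, area, facing, decoration, floor, year, building = parts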
    
    
