- Analyze the URL pattern
https://bj.lianjia.com/ershoufang/
https://bj.lianjia.com/ershoufang/pg2/
https://bj.lianjia.com/ershoufang/pg3/
...
# Page 1: https://bj.lianjia.com/ershoufang/pg1/ also redirects to https://bj.lianjia.com/ershoufang/
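So page n maps to /ershoufang/pgn/. A minimal sketch of generating the page URLs (base_url and the page range here are illustrative):
base_url = "https://bj.lianjia.com/ershoufang/{}"
urls = [base_url.format(f"pg{n}") for n in range(1, 4)]
print(urls)  # ['https://bj.lianjia.com/ershoufang/pg1', ..., '.../pg3']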
- Pin down the XPath of each element to scrape
# Inspect the target elements
"""
<div class="title"><a class="" href="https://" target="_blank" data-log_index="5" data-el="ershoufang" data-housecode="101115961312" data-is_focus="" data-sl="">新通国际南北通透两居室 双落地阳台 精装修 诚心出售</a><span class="goodhouse_tag tagBlock">必看好房</span></div>
<div class="positionInfo"><span class="positionIcon"></span><a href="https://" target="_blank" data-log_index="5" data-el="region">新通国际花园 </a> - <a href="https://" target="_blank">梨园</a> </div>
<div class="houseInfo"><span class="houseIcon"></span>2室1厅 | 94.05平米 | 南 北 | 其他 | 低楼层(共28层) | 板塔结合</div>
<div class="priceInfo"><div class="totalPrice totalPrice2"><i> </i><span class="">465</span><i>万</i></div><div class="unitPrice" data-hid="101115961312" data-rid="1111027381229" data-price="49442"><span>49,442元/平</span></div></div>
"""
.//div[@class='info clear'] # all the info for one listing; note: 1. the leading . is required, 2. when an element has multiple classes, every class name must be written out
# then pull each individual field out of that single-listing node
.//a[@data-housecode]/text() # listing title
.//div[@class='positionInfo']//text() # listing location
.//div[@class='houseInfo']/text() # basic house info, e.g. 3 bedrooms 2 living rooms
.//div[@class='totalPrice totalPrice2']//text() # total price
.//div[@class='unitPrice']//text() # price per square meter
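These XPaths can be sanity-checked against the snippet quoted above before running the crawler; a minimal sketch with lxml (the sample HTML is a trimmed copy of that block):
from lxml import etree

sample = """<div class="info clear">
<div class="title"><a data-housecode="101115961312" href="https://">新通国际南北通透两居室 双落地阳台 精装修 诚心出售</a></div>
<div class="positionInfo"><a>新通国际花园 </a> - <a>梨园</a></div>
<div class="priceInfo"><div class="totalPrice totalPrice2"><span>465</span><i>万</i></div></div>
</div>"""

root = etree.HTML(sample)
for div in root.xpath(".//div[@class='info clear']"):
    print(div.xpath(".//a[@data-housecode]/text()")[0])  # listing title
    print(''.join(div.xpath(".//div[@class='positionInfo']//text()")))  # 新通国际花园  - 梨园
    print(''.join(div.xpath(".//div[@class='totalPrice totalPrice2']//text()")))  # 465万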
- Improve efficiency
To reduce the impact of network jitter, add a retry mechanism (implemented in get_req() below).
- Other notes
If the CSV header write is placed inside save_file(), the header gets written again on every page, so the header write lives in run() instead; see the sketch below.
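A minimal sketch of the write-header-once pattern, separate from the full script (out.csv and the field names are illustrative):
import csv

fieldnames = ["house", "totalPrice"]  # illustrative subset of the real columns
with open("out.csv", "w", encoding="utf-8", newline="") as f:
    csv.DictWriter(f, fieldnames).writeheader()  # header written exactly once, before any pages
for page_rows in [[{"house": "demo", "totalPrice": "465万"}]]:  # stand-in for per-page results
    with open("out.csv", "a", encoding="utf-8", newline="") as f:
        csv.DictWriter(f, fieldnames).writerows(page_rows)  # append mode adds rows only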
Code:
# -*- coding:utf-8 -*-
"""
@author:百草Lily
@file:test_lianjia.py
@time:2022/8/1
"""
from faker import Faker
from urllib.request import Request, urlopen
from os.path import join, dirname
import time
import random
from lxml import etree
import csv
class LianJia:
    """
    Crawl Lianjia (链家) second-hand housing listings.
    """

    def __init__(self):
        self.url = "https://bj.lianjia.com/ershoufang/{}"
        self.tabhead = ["house", "positionInfo", "houseInfo", "totalPrice", "unitPrice"]
        self.retry = 0  # retry counter

    def get_req(self, url):
        """
        Send the request.
        :param url: page URL
        :return: decoded response body, or None after 3 failed attempts
        """
        # To reduce the impact of network jitter, set a timeout and retry up to 3 times on failure
        if self.retry < 3:
            try:
                ua = Faker(locale="zh_CN").user_agent()  # note: the keyword is locale, not local
                req = Request(url, headers={"User-Agent": ua})  # no proxy configured yet
                resp = urlopen(req, timeout=1).read().decode("utf-8")  # bytes -> str
                self.retry = 0  # reset so the next page gets its own 3 attempts
                return resp
            except Exception as e:
                print(f"Request failed: {e}")
                self.retry += 1
                return self.get_req(url)
        self.retry = 0  # give up on this page
        return None

    def parse_html(self, html):
        """
        Parse the HTML of one listing page.
        :param html: page source
        :return: list of dicts, one per listing
        """
        root = etree.HTML(html)
        # the leading . keeps the search relative to the current node; multi-class elements need the full class string
        divs = root.xpath(".//div[@class='info clear']")
        info = []
        for ele in divs:
            house = ele.xpath(".//a[@data-housecode]/text()")
            positionInfo = ''.join(ele.xpath(".//div[@class='positionInfo']//text()"))
            houseInfo = ele.xpath(".//div[@class='houseInfo']/text()")
            # e.g. "3室2厅 | 199.38平米 | 西南 | 精装 | 中楼层(共22层) | 2011年建 | 塔楼";
            # these fields could be stored as separate columns instead:
            # step 1: x1, x2, x3, x4 = s.split(" | ")  # unpack
            # step 2: add the corresponding field names to self.tabhead
            # step 3: add the matching header labels in run()
            # (see the standalone sketch after this script)
            totalPrice = ''.join(ele.xpath(".//div[@class='totalPrice totalPrice2']//text()"))  # total price
            unitPrice = ele.xpath(".//div[@class='unitPrice']//text()")  # unit price
            info.append(dict(zip(self.tabhead,
                                 [house[0], positionInfo, houseInfo[0], totalPrice, unitPrice[0]])))
        return info

    def save_file(self, filename, data):
        """
        Append one page of rows to the CSV file.
        :param filename: output path
        :param data: list of row dicts
        :return:
        """
        # An earlier version called f.write(i) per dict, which raised
        # TypeError: write() argument must be str, not dict -- hence csv.DictWriter
        with open(filename, "a", encoding="utf-8", newline="") as f:
            writer = csv.DictWriter(f, self.tabhead)
            # the header is written once in run(); writing it here duplicated it on every page
            writer.writerows(data)

    def run(self, pg):
        """
        Entry point.
        :param pg: number of pages to crawl
        :return:
        """
        # Write the header here, not in save_file(), so multi-page runs don't repeat it
        filename = join(dirname(__file__), f"链家前{pg}页_{time.strftime('%Y%m%d%H%M%S')}.csv")
        with open(filename, "w", encoding="utf-8", newline="") as f:
            writer = csv.DictWriter(f, self.tabhead)
            writer.writerow(dict(zip(self.tabhead, ["房屋", "位置", "房屋信息", "总价", "单价"])))  # Chinese header row
        for i in range(pg):
            url = self.url.format(f"pg{i + 1}")
            resp = self.get_req(url)
            if resp is None:  # page still failing after retries; skip it
                continue
            res = self.parse_html(resp)
            self.save_file(filename, res)
            time.sleep(random.uniform(2.0, 3.0))  # polite delay between page requests
if __name__ == "__main__":
page = 2
LianJia().run(page)
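The column-split idea noted in the comments of parse_html() could look like this standalone sketch (the English field names are illustrative, not part of the original script):
# Split houseInfo into separate columns, assuming the " | "-separated layout shown above
house_info = "2室1厅 | 94.05平米 | 南 北 | 其他 | 低楼层(共28层) | 板塔结合"
fields = house_info.split(" | ")
row = dict(zip(["layout", "area", "orientation", "decoration", "floor", "building"], fields))
print(row["layout"], row["area"])  # 2室1厅 94.05平米
# note: the field count varies per listing (some include a build year), so zip()
# silently truncates to the shorter side; handle that explicitly in real use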