- Analyze the URL pattern
https://bj.lianjia.com/ershoufang/
https://bj.lianjia.com/ershoufang/pg2/
https://bj.lianjia.com/ershoufang/pg3/
...
# Page 1: https://bj.lianjia.com/ershoufang/pg1/ also redirects to https://bj.lianjia.com/ershoufang/
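So page n maps to /ershoufang/pgn/. A minimal sketch of generating the page URLs (base_url and the page range here are illustrative):
base_url = "https://bj.lianjia.com/ershoufang/{}"
urls = [base_url.format(f"pg{n}") for n in range(1, 4)]
print(urls)  # ['https://bj.lianjia.com/ershoufang/pg1', ..., '.../pg3']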
- Pin down the XPath of each element to scrape
# Inspect the target elements
"""
<div class="title"><a class="" href="https://" target="_blank" data-log_index="5" data-el="ershoufang" data-housecode="101115961312" data-is_focus="" data-sl="">新通国际南北通透两居室 双落地阳台 精装修 诚心出售</a><span class="goodhouse_tag tagBlock">必看好房</span></div>
<div class="positionInfo"><span class="positionIcon"></span><a href="https://" target="_blank" data-log_index="5" data-el="region">新通国际花园 </a> - <a href="https://" target="_blank">梨园</a> </div>
<div class="houseInfo"><span class="houseIcon"></span>2室1厅 | 94.05平米 | 南 北 | 其他 | 低楼层(共28层) | 板塔结合</div>
<div class="priceInfo"><div class="totalPrice totalPrice2"><i> </i><span class="">465</span><i>万</i></div><div class="unitPrice" data-hid="101115961312" data-rid="1111027381229" data-price="49442"><span>49,442元/平</span></div></div>
"""
.//div[@class='info clear'] # all the info for one listing; note: 1. the leading . is required, 2. when an element has multiple classes, every class name must be written out
# then pull each individual field out of that single-listing node
.//a[@data-housecode]/text() # listing title
.//div[@class='positionInfo']//text() # listing location
.//div[@class='houseInfo']/text() # basic house info, e.g. 3 bedrooms 2 living rooms
.//div[@class='totalPrice totalPrice2']//text() # total price
.//div[@class='unitPrice']//text() # price per square meter
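These XPaths can be sanity-checked against the snippet quoted above before running the crawler; a minimal sketch with lxml (the sample HTML is a trimmed copy of that block):
from lxml import etree

sample = """<div class="info clear">
<div class="title"><a data-housecode="101115961312" href="https://">新通国际南北通透两居室 双落地阳台 精装修 诚心出售</a></div>
<div class="positionInfo"><a>新通国际花园 </a> - <a>梨园</a></div>
<div class="priceInfo"><div class="totalPrice totalPrice2"><span>465</span><i>万</i></div></div>
</div>"""

root = etree.HTML(sample)
for div in root.xpath(".//div[@class='info clear']"):
    print(div.xpath(".//a[@data-housecode]/text()")[0])  # listing title
    print(''.join(div.xpath(".//div[@class='positionInfo']//text()")))  # 新通国际花园  - 梨园
    print(''.join(div.xpath(".//div[@class='totalPrice totalPrice2']//text()")))  # 465万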
- Improve efficiency
To reduce the impact of network jitter, add a retry mechanism (implemented in get_req() below).
- Other notes
If the CSV header write is placed inside save_file(), the header gets written again on every page, so the header write lives in run() instead; see the sketch below.
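A minimal sketch of the write-header-once pattern, separate from the full script (out.csv and the field names are illustrative):
import csv

fieldnames = ["house", "totalPrice"]  # illustrative subset of the real columns
with open("out.csv", "w", encoding="utf-8", newline="") as f:
    csv.DictWriter(f, fieldnames).writeheader()  # header written exactly once, before any pages
for page_rows in [[{"house": "demo", "totalPrice": "465万"}]]:  # stand-in for per-page results
    with open("out.csv", "a", encoding="utf-8", newline="") as f:
        csv.DictWriter(f, fieldnames).writerows(page_rows)  # append mode adds rows only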
Code:
# -*- coding:utf-8 -*-
"""
@author:百草Lily
@file:test_lianjia.py
@time:2022/8/1
"""
from faker import Faker
from urllib.request import Request, urlopen
from os.path import join, dirname
import time
import random
from lxml import etree
import csv
class LianJia:
    """
    Crawl Lianjia (链家) second-hand housing listings.
    """

    def __init__(self):
        self.url = "https://bj.lianjia.com/ershoufang/{}"
        self.tabhead = ["house", "positionInfo", "houseInfo", "totalPrice", "unitPrice"]
        self.retry = 0  # retry counter

    def get_req(self, url):
        """
        Send the request.
        :param url: page URL
        :return: decoded response body, or None after 3 failed attempts
        """
        # To reduce the impact of network jitter, set a timeout and retry up to 3 times on failure
        if self.retry < 3:
            try:
                ua = Faker(locale="zh_CN").user_agent()  # note: the keyword is locale, not local
                req = Request(url, headers={"User-Agent": ua})  # no proxy configured yet
                resp = urlopen(req, timeout=1).read().decode("utf-8")  # bytes -> str
                self.retry = 0  # reset so the next page gets its own 3 attempts
                return resp
            except Exception as e:
                print(f"Request failed: {e}")
                self.retry += 1
                return self.get_req(url)
        self.retry = 0  # give up on this page
        return None

    def parse_html(self, html):
        """
        Parse the HTML of one listing page.
        :param html: page source
        :return: list of dicts, one per listing
        """
        root = etree.HTML(html)
        # the leading . keeps the search relative to the current node; multi-class elements need the full class string
        divs = root.xpath(".//div[@class='info clear']")
        info = []
        for ele in divs:
            house = ele.xpath(".//a[@data-housecode]/text()")
            positionInfo = ''.join(ele.xpath(".//div[@class='positionInfo']//text()"))
            houseInfo = ele.xpath(".//div[@class='houseInfo']/text()")
            # e.g. "3室2厅 | 199.38平米 | 西南 | 精装 | 中楼层(共22层) | 2011年建 | 塔楼";
            # these fields could be stored as separate columns instead:
            # step 1: x1, x2, x3, x4 = s.split(" | ")  # unpack
            # step 2: add the corresponding field names to self.tabhead
            # step 3: add the matching header labels in run()
            # (see the standalone sketch after this script)
            totalPrice = ''.join(ele.xpath(".//div[@class='totalPrice totalPrice2']//text()"))  # total price
            unitPrice = ele.xpath(".//div[@class='unitPrice']//text()")  # unit price
            info.append(dict(zip(self.tabhead,
                                 [house[0], positionInfo, houseInfo[0], totalPrice, unitPrice[0]])))
        return info

    def save_file(self, filename, data):
        """
        Append one page of rows to the CSV file.
        :param filename: output path
        :param data: list of row dicts
        :return:
        """
        # An earlier version called f.write(i) per dict, which raised
        # TypeError: write() argument must be str, not dict -- hence csv.DictWriter
        with open(filename, "a", encoding="utf-8", newline="") as f:
            writer = csv.DictWriter(f, self.tabhead)
            # the header is written once in run(); writing it here duplicated it on every page
            writer.writerows(data)

    def run(self, pg):
        """
        Entry point.
        :param pg: number of pages to crawl
        :return:
        """
        # Write the header here, not in save_file(), so multi-page runs don't repeat it
        filename = join(dirname(__file__), f"链家前{pg}页_{time.strftime('%Y%m%d%H%M%S')}.csv")
        with open(filename, "w", encoding="utf-8", newline="") as f:
            writer = csv.DictWriter(f, self.tabhead)
            writer.writerow(dict(zip(self.tabhead, ["房屋", "位置", "房屋信息", "总价", "单价"])))  # Chinese header row
        for i in range(pg):
            url = self.url.format(f"pg{i + 1}")
            resp = self.get_req(url)
            if resp is None:  # page still failing after retries; skip it
                continue
            res = self.parse_html(resp)
            self.save_file(filename, res)
            time.sleep(random.uniform(2.0, 3.0))  # polite delay between page requests
if __name__ == "__main__":
page = 2
LianJia().run(page)
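The column-split idea noted in the comments of parse_html() could look like this standalone sketch (the English field names are illustrative, not part of the original script):
# Split houseInfo into separate columns, assuming the " | "-separated layout shown above
house_info = "2室1厅 | 94.05平米 | 南 北 | 其他 | 低楼层(共28层) | 板塔结合"
fields = house_info.split(" | ")
row = dict(zip(["layout", "area", "orientation", "decoration", "floor", "building"], fields))
print(row["layout"], row["area"])  # 2室1厅 94.05平米
# note: the field count varies per listing (some include a build year), so zip()
# silently truncates to the shorter side; handle that explicitly in real use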