导入相关包
import logging
import re

import requests
from lxml import etree
定义爬取函数
# Destination file: one ``dict`` repr per scraped deal record, one per line.
OUTPUT_PATH = 'SHhoure.txt'
REQUEST_HEADERS = {'User-Agent': 'Mozilla/5.0'}
# Proxy applied to the index (listing) pages only; detail pages are fetched
# directly, exactly as before.
# NOTE(review): the proxy has no port and is hard-coded -- confirm it is
# still reachable and whether detail pages should use it too.
PROXY_HOST = '27.159.164.164'
PROXIES = {'http': 'http://' + PROXY_HOST, 'https': 'https://' + PROXY_HOST}
MAX_RETRIES = 5
# Seconds to wait for a response before giving up on a request.
REQUEST_TIMEOUT = 10
# Matches "resblockPosition:'<first>,<second>'" in the detail page source.
_POSITION_RE = re.compile(r"resblockPosition:'(.+),(.+)'")

logger = logging.getLogger(__name__)


def _new_session():
    """Return a ``requests.Session`` with retries and the crawler headers set."""
    session = requests.Session()
    # Retries are configured on the adapter explicitly instead of by
    # reassigning ``requests.adapters.DEFAULT_RETRIES``.
    adapter = requests.adapters.HTTPAdapter(max_retries=MAX_RETRIES)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    session.headers.update(REQUEST_HEADERS)
    return session


def _first(values, default=''):
    """Return the first item of *values*, or *default* when it is empty."""
    return values[0] if values else default


def _parse_detail(page_text):
    """Build the record dict for one deal-detail page.

    Args:
        page_text: Raw HTML source of the detail page.

    Returns:
        A dict keyed by the Chinese column names, or ``None`` when the page
        does not contain the ``resblockPosition`` marker (taken here as a
        sign that the response is not a real detail page).
    """
    position = _POSITION_RE.search(page_text)
    if position is None:
        return None
    lon, lat = position.groups()  # longitude, latitude
    doc = etree.HTML(page_text)

    # NOTE(review): the listing id is read from the "resblock" attribute and
    # the residential-complex id from the "housedel" attribute. This pairing
    # is kept from the original code -- verify against a live page.
    house_code = ''.join(doc.xpath('//div[4]/@data-lj_action_resblock_id'))
    resblock_id = _first(doc.xpath('//div[4]/@data-lj_action_housedel_id'))

    # The meta content is split on ';' into "province=..." and "city=..." parts.
    location = _first(doc.xpath('//meta[@name="location"]/@content')).split(';')
    province = location[0].replace('province=', '')
    city = location[1].replace('city=', '') if len(location) > 1 else ''

    # Breadcrumb links: the 3rd is the district, the 4th the sub-district.
    district = _first(
        doc.xpath('//div[@class="deal-bread"]/a[3]/text()')
    ).replace('二手房成交', '')
    if district:
        district += '区'
    sub_district = _first(
        doc.xpath('//div[@class="deal-bread"]/a[4]/text()')
    ).replace('二手房成交', '')

    title = ''.join(doc.xpath('//div[@class="wrapper"]/text()')).split(' ')[0]
    deal_date = ''.join(
        doc.xpath('//div[@class="wrapper"]/span/text()')
    ).split(' ')[0]
    total_price = _first(doc.xpath('//div[@class="price"]/span/i/text()'))
    avg_price = _first(doc.xpath('//div[@class="price"]/b/text()'))

    # Deal statistics: labels such as listing price, days on market, price
    # adjustments, viewings, followers and page views, with their values.
    stat_labels = doc.xpath('//div[@class="msg"]/span/text()')
    stat_values = doc.xpath('//div[@class="msg"]/span/label/text()')
    # Basic/transaction attributes (layout, floor, area, orientation, ...);
    # the values are taken from the first 20 <li> text nodes of the page.
    attr_labels = doc.xpath('//span[@class="label"]/text()[1]')
    attr_values = doc.xpath('//li/text()')[:20]

    house = {
        '房源名称': title, '房源id': house_code, '经度': lon, '纬度': lat,
        '所在省份': province, '所在城市': city,
        '主城市': district, '下辖区': sub_district,
        '楼盘id': resblock_id,
        '成交日期': deal_date, '成交价格(万)': total_price,
        '成交均价(元/平)': avg_price,
    }
    for labels, values in ((stat_labels, stat_values), (attr_labels, attr_values)):
        for label, value in zip(labels, values):
            house[str(label)] = value.strip()
    return house


def page_parse(url, out=None, session=None):
    """Scrape one index page and write a record for every deal it links to.

    Args:
        url: Address of the index page listing the deals.
        out: Writable text stream that receives one ``dict`` repr per line.
            When omitted, records are appended to ``OUTPUT_PATH``.
        session: ``requests.Session`` to reuse across calls. When omitted, a
            retrying session is created for this call and closed afterwards.

    Pages that cannot be fetched or parsed are logged and skipped so that a
    single bad response does not abort the whole crawl.
    """
    if session is None:
        with _new_session() as own_session:
            page_parse(url, out, own_session)
        return
    if out is None:
        with open(OUTPUT_PATH, 'a', encoding='utf-8-sig') as stream:
            page_parse(url, stream, session)
        return

    try:
        response = session.get(url, proxies=PROXIES, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        logger.warning('Skipping index page %s: %s', url, exc)
        return

    body = response.text
    index = etree.HTML(body) if body.strip() else None
    if index is None:
        logger.warning('Skipping index page %s: empty or unparsable body', url)
        return

    for href in index.xpath('//div[@class="title"]/a/@href'):
        try:
            detail = session.get(href, timeout=REQUEST_TIMEOUT)
            detail.raise_for_status()
        except requests.RequestException as exc:
            logger.warning('Skipping detail page %s: %s', href, exc)
            continue
        house = _parse_detail(detail.text)
        if house is None:
            logger.warning('Skipping detail page %s: no resblockPosition', href)
            continue
        out.write("%s\n" % house)


def main():
    """Crawl index pages 1-100 of every listed district into ``OUTPUT_PATH``."""
    position_list = [
        'pudong', 'minhang', 'baoshan', 'xuhui', 'putuo', 'yangpu',
        'changning', 'songjiang', 'jiading', 'huangpu', 'jingan', 'hongkou',
        'qingpu', 'fengxian', 'jinshan', 'chongming', 'shanghaizhoubian',
    ]
    # The file is truncated once per run and shared by every page_parse call.
    with open(OUTPUT_PATH, 'w', encoding='utf-8-sig') as out, \
            _new_session() as session:
        for position in position_list:
            for i in range(1, 101):
                url = 'https://sh.lianjia.com/chengjiao/{}/pg{}/'.format(position, i)
                page_parse(url, out, session)


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    main()
数据集样式
{'房源名称': '高兴花园',
'房源id': '107101754775',
'经度': '121.409033',
'纬度': '31.117302',
'所在省份': '上海',
'所在城市': '上海',
'主城市': '闵行区',
'下辖区': '春申',
'楼盘id': '5011000014981',
'成交日期': '2019.09.18',
'成交价格(万)': '258',
'成交均价(元/平)': '69542',
'挂牌价格(万)': '265',
'房屋户型': '1室1厅1厨1卫',
'所在楼层': '高楼层(共6层)',
'建筑面积': '37.1㎡',
'户型结构': '平层',
'套内面积': '暂无数据',
'建筑类型': '板楼',
'房屋朝向': '南',
'建成年代': '1996',
'装修情况': '精装',
'建筑结构': '砖混结构',
'供暖方式': '',
'梯户比例': '一梯四户',
'产权年限': '70年',
'配备电梯': '无',
'链家编号': '107101754775',
'交易权属': '商品房',
'挂牌时间': '2019-09-15',
'房屋用途': '普通住宅',
'房屋年限': '暂无数据',
'房权所属': '非共有',
'成交周期(天)': '4',
'调价(次)': '0',
'带看(次)': '3',
'关注(人)': '0',
'浏览(次)': '1'}
网友评论