#!/usr/bin/env python
# encoding: utf-8
"""
Author: ISeeMoon
Python: 3.6
Software: PyCharm
File: Lj_async.py
Time: 2018/5/6 15:26
"""
import requests
from lxml import etree
import asyncio
import aiohttp
import pandas
import re
import math
import time
location_info = '''1→Hangzhou
2→Wuhan
3→Beijing
Press ENTER to confirm: '''
location_select = input(location_info)
location_dic = {'1': 'hz',
                '2': 'wh',
                '3': 'bj'}
city_url = 'https://{}.lianjia.com/ershoufang/'.format(location_dic[location_select])
down = input('Lower price bound (10k CNY): ')
up = input('Upper price bound (10k CNY): ')
inter_list = [(int(down), int(up))]
def half_inter(inter):
    # Replace a price interval with its two halves.
    lower = inter[0]
    upper = inter[1]
    delta = int((upper - lower) / 2)
    inter_list.remove(inter)
    print('Narrowed price interval', inter)
    inter_list.append((lower, lower + delta))
    inter_list.append((lower + delta, upper))
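# Why bisect: Lianjia appears to cap each query at 100 result pages of 30
# listings, i.e. at most 3000 listings are reachable per filter. Any price
# interval matching more than 3000 listings is therefore split in half, and
# the halves re-checked, until every interval can be paged through in full.
# For example, if (100, 200) matched 4500 listings it would become (100, 150)
# and (150, 200).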
pagenum = {}
def get_num(inter):
url = city_url + 'bp{}ep{}/'.format(inter[0],inter[1])
r = requests.get(url).text
num = int(etree.HTML(r).xpath("//h2[@class='total fl']/span/text()")[0].strip())
pagenum[(inter[0],inter[1])] = num
return num
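# The bp{lower}ep{upper} segment is Lianjia's total-price filter, in units of
# 10k CNY to match the prompt above; e.g. city_url + 'bp100ep200/' lists homes
# priced between 100万 and 200万. The match count for the filter sits in the
# <h2 class="total fl"> node.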
totalnum = get_num(inter_list[0])
judge = True
while judge:
    judge = False
    # Iterate over a snapshot: half_inter() mutates inter_list while we loop.
    for i in inter_list[:]:
        if get_num(i) > 3000:
            half_inter(i)
            judge = True
print('Finished narrowing the price intervals!')
url_lst = []
url_lst_failed = []
url_lst_succeeded = []
url_lst_duplicated = []
for i in inter_list:
    totalpage = math.ceil(pagenum[i] / 30)   # Lianjia shows 30 listings per page
    for j in range(1, totalpage + 1):
        url = city_url + 'pg{}bp{}ep{}/'.format(j, i[0], i[1])
        url_lst.append(url)
print('URL list built!')
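# A finished page URL combines pagination with the price filter, e.g.
# city_url + 'pg3bp100ep150/' would be page 3 of the 100万–150万 interval.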
info_lst = []

def pick(items, i=0, default='/'):
    # items[i] if it exists, else a '/' placeholder (cards sometimes omit fields).
    try:
        return items[i]
    except IndexError:
        return default

async def get_info(url):
    async with aiohttp.ClientSession() as session:
        try:
            async with session.get(url, timeout=5) as resp:
                if resp.status != 200:
                    url_lst_failed.append(url)
                    return
                url_lst_succeeded.append(url)
                r = await resp.text()
        except (aiohttp.ClientError, asyncio.TimeoutError):
            # A timed-out or refused request would otherwise end up in neither
            # list and silently disappear from the retry bookkeeping.
            url_lst_failed.append(url)
            return
    nodelist = etree.HTML(r).xpath("//ul[@class='sellListContent']/li")
    print('Scraping {}'.format(url))
    index = 1
    for node in nodelist:
        info_dic = {}
        info_dic['title'] = pick(node.xpath(".//div[@class='title']/a/text()"))
        info_dic['href'] = pick(node.xpath(".//div[@class='title']/a/@href"))
        # houseInfo is a single '|'-separated string:
        # community | layout | area | orientation | decoration | elevator
        house_node = node.xpath(".//div[@class='houseInfo']")
        parts = house_node[0].xpath('string(.)').replace(' ', '').split('|') if house_node else []
        for i, key in enumerate(('xiaoqu', 'huxing', 'area', 'chaoxiang', 'zhuangxiu', 'dianti')):
            info_dic[key] = pick(parts, i)
        position = pick(node.xpath(".//div[@class='positionInfo']/text()"), default='')
        # Keep the first regex match as a string rather than the whole findall list.
        info_dic['louceng'] = pick(re.findall(r'\((.*)\)', position))
        info_dic['nianxian'] = pick(re.findall(r'\)(.*?)年', position))
        # followInfo carries follower count / viewing count / listing date.
        follow = pick(node.xpath(".//div[@class='followInfo']/text()"), default='').replace(' ', '').split('/')
        info_dic['guanzhu'] = ''.join(re.findall('[0-9]', pick(follow, 0, ''))) or '/'
        info_dic['daikan'] = ''.join(re.findall('[0-9]', pick(follow, 1, ''))) or '/'
        info_dic['fabu'] = pick(follow, 2)
        info_dic['totalprice'] = pick(node.xpath(".//div[@class='totalPrice']/span/text()"))
        info_dic['unitprice'] = pick(node.xpath(".//div[@class='unitPrice']/span/text()")).replace('单价', '')
        # Skip listings whose detail URL was already collected on another page.
        if any(info_dic['href'] in dic.values() for dic in info_lst):
            url_lst_duplicated.append(info_dic)
        else:
            info_lst.append(info_dic)
            print('No.{}: {} → listing scraped!'.format(index, info_dic['title']))
            index += 1
start = time.time()
# First pass over url_lst; a few URLs may never be requested (e.g. timeouts),
# so anything that did not succeed is retried below.
tasks = [asyncio.ensure_future(get_info(url)) for url in url_lst]
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))
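# asyncio.wait() returns (done, pending) sets of tasks; an exception raised
# inside a task is stored on the task object rather than propagated here, so
# one failed page cannot abort the whole batch.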
# Collect every URL that has not yet been fetched successfully and keep
# re-scraping until none remain.
url_lst_unrequested = []
for url in url_lst:
    if url not in url_lst_succeeded:
        url_lst_unrequested.append(url)
while len(url_lst_unrequested) > 0:
    tasks_unrequested = [asyncio.ensure_future(get_info(url)) for url in url_lst_unrequested]
    loop.run_until_complete(asyncio.wait(tasks_unrequested))
    url_lst_unrequested = [url for url in url_lst if url not in url_lst_succeeded]
end = time.time()
print('This price range holds {} listings in total ({} duplicates among them); {} unique listings were collected.'.format(totalnum, len(url_lst_duplicated), len(info_lst)))
print('Elapsed: {:.1f} s'.format(end - start))
df = pandas.DataFrame(info_lst)
df.to_csv(r"C:\test\ljwh.csv", encoding='gbk')
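# Note: encoding='gbk' opens cleanly in Excel on Chinese-locale Windows, but it
# raises UnicodeEncodeError for characters outside GBK; encoding='utf-8-sig'
# is the safer alternative if that happens.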
################## Synchronous version (for timing comparison) ##########################
# Same parsing logic as get_info(), but fetching pages one at a time with
# requests.get(); used only to benchmark against the asyncio version above.