Web scraping: Ziroom results
- Ziroom results
Ziroom's prices are a pain: every page load pulls in an irregularly named image, and the digits on that image are in scrambled order.
Idea 1: build a dictionary mapping each image to its digits, store a few pages' worth as a local lookup table, and read the digits from it each time -- failed: the image URL changes on every refresh, so the local table would have to be huge.
Idea 2: use OCR to recognize the image and tie pixel positions to digits -- failed: I went through a few image-recognition tutorials but couldn't make sense of them (a hedged sketch of this idea follows the list).
Idea 3: type the prices in by hand and line them up with the scraped rows -- not very reliable: each extraction sees 1-2 listings change at random, so rows stop matching; over 3 pages roughly 10 listings failed to line up.
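A minimal sketch of what idea 2 could look like, assuming the price digits are rendered from one sprite image whose scrambled order can be OCR'd once per page. The sprite URL handling, the ~21.4px per-digit step, and the offsets in the usage comment are hypothetical placeholders, not Ziroom's actual markup:

import io
import requests
from PIL import Image
import pytesseract

def read_digit_order(sprite_url):
    # OCR the scrambled-digit sprite once per page, e.g. returning '4907126358'
    raw = requests.get(sprite_url).content
    img = Image.open(io.BytesIO(raw)).convert('L')   # grayscale tends to OCR better
    text = pytesseract.image_to_string(
        img, config='--psm 7 -c tessedit_char_whitelist=0123456789')
    return text.strip()

def decode_price(offsets, digit_order, step=21.4):
    # Each shown digit is a horizontal shift of the sprite; offset / step gives an
    # index into the OCR'd digit order (the step value is an assumption).
    return int(''.join(digit_order[round(abs(off) / step)] for off in offsets))

# Hypothetical usage, with offsets scraped from each price span's background-position:
# digit_order = read_digit_order(sprite_url)
# price = decode_price([-64.2, -21.4, -0.0], digit_order)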
- Danke results
Danke is much more regular: the data is easy to extract, no login or cookies needed, just the usual requests + BeautifulSoup routine.
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 22 01:14:37 2019
@author: 无敌钢牙小白狼
"""
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
def get_cookie(data):
    # Turn a raw "k1=v1; k2=v2" cookie string into a dict
    a = []
    b = []
    temp = data.split(';')
    for i in temp:
        a.append(i.split('=')[0].strip())
        b.append(i.split('=')[1].strip())
    dic = dict(zip(a, b))
    return dic
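# Example usage (the cookie string below is a made-up placeholder, not from the post):
#   get_cookie('sessionid=abc123; csrftoken=xyz789')
#   -> {'sessionid': 'abc123', 'csrftoken': 'xyz789'}
# The result could be passed as requests.get(url, cookies=...) if a site required it;
# Danke does not, so this helper goes unused below.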
def get_soup(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/75.0.3770.142 Safari/537.36'}
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "lxml")
    return soup
def get_data(url):
    soup = get_soup(url)
    # title block: "metro  plot  nums" text plus the listing link
    titles = soup.select('body > div.website-main > div > div.roomlist > '
                         'div.list-wrapper > div.r_ls_box > div > div.r_lbx_cen > '
                         'div.r_lbx_cena > a')
    href = []
    metro = []
    plot = []
    nums = []
    for i in titles:
        temp = i.string.strip()
        metro.append(temp.split()[0])
        plot.append(temp.split()[1])
        nums.append(temp.split()[2])
        temp2 = str(i).split('href="')[1].split('"')[0]
        href.append(temp2)
    # location
    locs = soup.select('body > div.website-main > div > div.roomlist > div.list-wrapper '
                       '> div.r_ls_box > div > div.r_lbx_cen > div.r_lbx_cena > div')
    loc = []
    for i in locs:
        temp = str(i).split('</div>')[1].strip()
        loc.append(temp)
    # info: area / floor / orientation
    info = soup.select('body > div.website-main > div > div.roomlist > div.list-wrapper '
                       '> div.r_ls_box > div > div.r_lbx_cen > div.r_lbx_cenb')
    area = []
    floor = []
    towards = []
    for i in info:
        temp = str(i).split('</div>')[1].strip()
        area.append(float(temp.split('|')[0].split('约')[1].split('㎡')[0].strip()))
        floor.append(temp.split('|')[1].strip())
        towards.append(temp.split('|')[3].split('<i>')[0].strip())
    # price: skip the "元/月" label spans, keep only the numeric ones
    price_info = soup.select('body > div.website-main > div > div.roomlist > div.list-wrapper '
                             '> div.r_ls_box > div > div.r_lbx_money > div > span')
    price = []
    for i in price_info:
        if '元' not in i.string:
            temp = str(i).split('_b">')[1].split('</')[0]
            price.append(int(temp))
    df = pd.DataFrame({'metro': metro, 'towards': towards, 'nums': nums, 'loc': loc,
                       'plot': plot, 'area': area, 'floor': floor, 'href': href,
                       'price': price})
    return df
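# One call parses a single search-results page into a DataFrame with columns
# metro, towards, nums, loc, plot, area, floor, href, price. Hypothetical example
# (URL pattern taken from run()/get_df() below):
#   page_df = get_data('https://www.danke.com/room/sz?search=1'
#                      '&search_text=%E6%99%AF%E7%94%B0&from=home&page=1')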
# Fetch several pages of results for one search URL
def get_df(url):
    df = pd.DataFrame()
    for i in range(5):
        page_url = url + '&page=' + str(i + 1)   # build each page URL from the base
        temp = get_data(page_url)
        df = pd.concat([df, temp], axis=0)
        time.sleep(3)
        print('ok')
    return df
# Run the search for several locations
def run(url_list, name_list):
    dic = dict(zip(url_list, name_list))
    df = pd.DataFrame()
    for key in dic:
        url = 'https://www.danke.com/room/sz?search=1&search_text=' + key + '&from=home'
        temp = get_df(url)
        df = pd.concat([df, temp], axis=0)
        print(dic[key], '完成')   # print the readable location name when it is done
        time.sleep(3)
    return df
# Filter the combined results
def handle_df(df):
    df['price/area'] = df['price'] / df['area']
    df.sort_values(by=['price/area'], inplace=True)
    df_result = df[df.price <= 2500]             # monthly rent cap
    df_result = df_result[df_result.area >= 8]   # minimum area in ㎡
    return df_result
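# Note: the two filters above are equivalent to a single boolean mask,
# e.g. df[(df.price <= 2500) & (df.area >= 8)].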
#%%
if __name__ == '__main__':
    # URL-encoded search keywords and their readable names
    url_list = ['%E6%99%AF%E7%94%B0', '%E8%8E%B2%E8%8A%B1%E5%8C%97', '%E9%A6%99%E8%9C%9C',
                '%E5%AE%89%E6%89%98%E5%B1%B1', '%E5%B8%82%E6%B0%91%E4%B8%AD%E5%BF%83']
    name_list = ['景田', '莲花北', '香蜜', '安托山', '市民中心']
    df = run(url_list, name_list)
    df_result = handle_df(df)
Possible improvements:
1. Using regular expressions for the field matching would be much simpler (a sketch follows below).
2. The data extraction is still somewhat repetitive; consider batching it.
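A sketch of improvement 1, replacing the chained str.split calls in get_data with one regex. The sample string is a made-up illustration of the r_lbx_cenb text layout implied by the split logic (约<area>㎡ | floor | ... | orientation), not real listing data:

import re

pattern = re.compile(r'约\s*([\d.]+)㎡\s*\|\s*([^|]+)\|[^|]*\|\s*([^<|]+)')
sample = '约 12.5㎡ | 4/6层 | 3室1厅 | 朝南'   # made-up example text
m = pattern.search(sample)
if m:
    area = float(m.group(1))       # 12.5
    floor = m.group(2).strip()     # '4/6层'
    towards = m.group(3).strip()   # '朝南'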
Output (the result screenshots are omitted here): the filtered Danke results and the filtered Ziroom results.
PS: judging by price/area, Danke offers slightly better value.
Approach 2:
An updated version using XPath:
import requests
from lxml import etree
import time

def get_data(url, header):
    r = requests.get(url, headers=header)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    html = etree.HTML(r.text)
    # title
    titles = html.xpath('//*[@id="resultList"]/div/p/span/a')
    title_info = []
    for title in titles:
        temp = title.text
        title_info.append(temp.split()[0])
    return title_info   # the remaining fields follow the same XPath pattern
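For completeness, a hypothetical call of the XPath version (the header dict and URL below are placeholders, not taken from the post):

# header = {'User-Agent': 'Mozilla/5.0'}
# titles = get_data('https://www.danke.com/room/sz?search=1&search_text=%E6%99%AF%E7%94%B0&from=home', header)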