Goal
- Scrape listing information from Xiaozhu (xiaozhu.com) short-term rentals, limited to the first three pages for now
- From each list page, collect each listing's title, price, and detail-page link
- Store the scraped information in a database
- Query the database for listings by price
Code example
# coding: UTF-8
'''
CSS selectors for the fields scraped from the list page:
Title:            div.result_btm_con.lodgeunitname > div.result_intro > a.sTitle > span
Price:            div.result_btm_con.lodgeunitname > div > span.result_price > i
Detail-page link: ul > li > a.resule_img_a
'''
from bs4 import BeautifulSoup
from pymongo import MongoClient
import requests, time
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
'Cookie': 'TY_SESSION_ID=76a72fb7-6086-43eb-8d89-e940bc6e0b7c; xz_guid_4se=23919729-5f3c-490e-a3bd-eaab133bf8a7; _ga=GA1.2.548704816.1530023215; gr_user_id=481ee4b7-fbbb-4780-878f-d557e50f43cd; __utma=29082403.548704816.1530023215.1530104445.1530107679.4; 59a81cc7d8c04307ba183d331c373ef6_gr_last_sent_cs1=N%2FA; grwng_uid=4810f5c6-a7ea-4916-a2dd-0f6eca0735ae; _uab_collina=154600556723767889767416; abtest_ABTest4SearchDate=b; xzuuid=85d7e34e; 59a81cc7d8c04307ba183d331c373ef6_gr_session_id=4aecb865-3f10-45fa-907a-97bb62ec1c73; 59a81cc7d8c04307ba183d331c373ef6_gr_last_sent_sid_with_cs1=4aecb865-3f10-45fa-907a-97bb62ec1c73; 59a81cc7d8c04307ba183d331c373ef6_gr_session_id_4aecb865-3f10-45fa-907a-97bb62ec1c73=true',
}
host = 'localhost'           # MongoDB host
port = 27017                 # MongoDB port
start_page, end_page = 1, 4  # scrape pages 1 to 3 (range end is exclusive)
max_payment = 500            # price threshold used in the final query
# Connect to MongoDB
client = MongoClient(host, port)
# Get (or lazily create) the database
xz_houses = client['xz_houses']
# Get (or lazily create) the collection, MongoDB's counterpart of a table
links_prices = xz_houses['links_prices']
def get_links_prices(url):
    # Pause between requests to avoid hammering the server
    time.sleep(3)
    # Request the list page
    wb_data = requests.get(url, headers=headers)
    # Parse the page with BeautifulSoup
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # Pick out the target elements via their CSS selectors
    titles = soup.select('div.result_btm_con.lodgeunitname > div.result_intro > a.sTitle > span')
    prices = soup.select('div.result_btm_con.lodgeunitname > div > span.result_price > i')
    links = soup.select('ul > li > a.resule_img_a')
    # Normalize the data and store it in the collection
    for title, price, link in zip(titles, prices, links):
        data = {
            'title': title.get_text(),
            # store the price as an int so numeric comparisons ($gt) work in queries
            'price': int(price.get_text()),
            'link': link.get('href'),
        }
        # Insert one document per listing
        links_prices.insert_one(data)
if __name__ == '__main__':
    for i in range(start_page, end_page):
        url = 'http://hz.xiaozhu.com/search-duanzufang-p{page}-0/'.format(page=str(i))
        get_links_prices(url)
    # Query the collection for listings priced above max_payment
    print('Listings priced above {fee}:'.format(fee=max_payment))
    for item in links_prices.find({'price': {'$gt': max_payment}}):
        print('Title:', item['title'], '\t\t Price:', item['price'], '\t\t Link:', item['link'])
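Note that running the script more than once inserts duplicate documents, because insert_one never checks whether a listing is already stored. If that matters, one option is to upsert on the detail-page link instead. Below is a minimal sketch of that variation, assuming it replaces the insert_one call inside the loop and reuses the same links_prices collection and data dict from the code above.

# Sketch: upsert keyed on the detail-page link, so re-running the scraper
# updates existing documents instead of duplicating them.
links_prices.update_one(
    {'link': data['link']},   # match on the detail-page URL, which is unique per listing
    {'$set': data},           # refresh title and price with the newly scraped values
    upsert=True,              # insert a new document if no match exists
)

Because the price is converted with int() before being stored, the final find query with {'$gt': max_payment} performs a true numeric comparison rather than a string comparison.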