My result:
(screenshot: the scraped data in MongoDB, viewed in PyCharm)
My code:
```python
from bs4 import BeautifulSoup
import requests, pymongo

# Index pages 1-3 (the range starts at 1, so the nonexistent p0 is skipped)
urls = ['http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(str(i)) for i in range(1, 4)]

client = pymongo.MongoClient('localhost', 27017)  # connect to the client
walden = client['walden']                         # name the database
xiaozu_3page = walden['xiaozu_3page']             # name the collection

# Collect listing links from an index page
def get_pages(url):
    data = requests.get(url)
    soup = BeautifulSoup(data.text, 'lxml')
    pages = soup.select('.resule_img_a')
    for page in pages:
        page = page.get('href')
        duanzufang_info(page)

# Scrape the details of a single listing page
def duanzufang_info(href):
    web_data = requests.get(href)
    soup = BeautifulSoup(web_data.text, 'lxml')
    title = soup.select('.pho_info em')[0].get_text()
    addr = soup.select('.pho_info p')[0].get('title')
    price = soup.select('.day_l span')[0].get_text()
    area = soup.select('.border_none p')[0].get_text().split()[0]
    house_type = soup.select('.border_none p')[0].get_text().split()[1]
    for_people = soup.select('.h_ico2')[0].get_text()
    bed_num = soup.select('.h_ico3')[0].get_text()
    data = {
        'title': title,
        'address': addr,
        'price': int(price),   # store as int so >= comparisons work later
        'area': area,
        'type': house_type,
        'people': for_people,
        'bed': '床' + bed_num  # prefix once here (prefixing in both places doubled it)
    }
    print(data)
    # Write data into MongoDB
    xiaozu_3page.insert_one(data)

# Select listings priced at 500 or above
def find_fangzi():
    for info in xiaozu_3page.find():
        if info['price'] >= 500:
            print(info)

# for url in urls:
#     get_pages(url)
find_fangzi()
```
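Incidentally, the price filter could be pushed down into MongoDB itself with the $gte query operator, instead of looping over every document in Python — a minimal sketch against the same collection:

```python
# Let MongoDB do the filtering: return only documents whose price is >= 500
def find_fangzi_query():
    for info in xiaozu_3page.find({'price': {'$gte': 500}}):
        print(info)
```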
My thoughts:
- Took nearly an hour to finish.
- The xiaozhu part was quick to write: no headers, no proxies, not even time.sleep(). Crude, but it worked. (A politeness sketch follows after this list.)
- The MongoDB GUI looks so friendly.
- At first I got clever and stored price as ¥ + price, which backfired: without int() the values couldn't be filtered with >=, and when I tried changing the data type inside MongoDB instead, every price turned into 0 (probably the ¥ sign's fault). A cleanup sketch also follows below.
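As noted in the list above, the scraper runs with no headers and no delay. Adding both takes only a few lines — a minimal sketch, where the User-Agent string is just an illustrative placeholder:

```python
import time
import requests

# Hypothetical example headers; any real browser User-Agent string would do
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

for url in urls:
    data = requests.get(url, headers=headers)  # identify as a browser
    time.sleep(2)                              # pause between requests to go easy on the site
    # ... parse data.text as before
```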
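And the ¥ problem is easier to handle in Python before the data ever reaches MongoDB — a sketch, assuming the raw text might carry a currency symbol:

```python
raw_price = '¥500'                  # hypothetical raw value with a currency symbol
price = int(raw_price.lstrip('¥'))  # strip the symbol first, then convert
print(price >= 500)                 # True: numeric comparisons now work
```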