任务:
获取三页小猪短租房租信息,并写入到数据库中
http://sh.xiaozhu.com/search-duanzufang-p{}-0/
成果:
Snip20170525_3.png
代码:
from bs4 import BeautifulSoup
import requests
import pymongo
# Handles for the local MongoDB server, the 'xiaozhu' database and its
# 'house_info' collection, where scraped listings are stored.
client = pymongo.MongoClient(host='localhost', port=27017)
xiaozhu = client.get_database('xiaozhu')
house_info = xiaozhu.get_collection('house_info')
def SexJudge(sex: str) -> str:
    """Map the landlord badge's CSS class string to a gender label.

    Args:
        sex: CSS class string of the badge element.

    Returns:
        'man' when the class is exactly 'member_ico', otherwise 'woman'.
    """
    # Every class other than 'member_ico' falls through to 'woman'.
    return 'man' if sex == 'member_ico' else 'woman'
def GetInfo(houseUrl):
    """Fetch one listing's detail page and extract its fields.

    Args:
        houseUrl: URL of the listing detail page.

    Returns:
        dict with the keys 'title', 'address', 'rent' (int), 'housePic',
        'landlordPic' and 'sex' ('man' or 'woman').

    Raises:
        ValueError: if any expected element is missing from the page.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    wb_data = requests.get(houseUrl, timeout=10)
    soup = BeautifulSoup(wb_data.text, 'lxml')

    member_pic = '#floatRightBox > div.js_box.clearfix > div.member_pic'
    selectors = {
        'title': 'head > title',
        'address': 'span.pr5',
        'rent': '#pricePart > div.day_l > span',
        'housePic': '#curBigImage',
        'landlordPic': member_pic + ' > a > img',
        'sex': member_pic + ' > div',
    }
    # Only the first match of each selector is used: one record per page.
    found = {name: soup.select_one(css) for name, css in selectors.items()}

    # Fail with a clear message instead of an UnboundLocalError when the
    # page does not have the expected layout.
    missing = [name for name, tag in found.items() if tag is None]
    if missing:
        raise ValueError(
            'missing {} on page {}'.format(', '.join(missing), houseUrl)
        )

    data = {
        'title': found['title'].get_text(),
        # Keep only the first whitespace-separated token of the address text.
        'address': found['address'].get_text().split()[0],
        'rent': int(found['rent'].get_text()),
        'housePic': found['housePic'].get('src'),
        'landlordPic': found['landlordPic'].get('src'),
        # The class attribute is a list; join it into one string to classify.
        'sex': SexJudge(''.join(found['sex'].get('class'))),
    }
    print(data)
    return data
# URLs of search-result pages 1 through 3.
menuUrl = list(map('http://sh.xiaozhu.com/search-duanzufang-p{}-0/'.format, range(1, 4)))
def GetHouseUrl(menuUrl):
    """Collect listing detail-page links from the given search-result pages.

    Args:
        menuUrl: iterable of search-result page URLs.

    Returns:
        list of href values, in page order and, within a page, in the order
        the links appear in the document.
    """
    data = []
    for n, url in enumerate(menuUrl, start=1):
        # A timeout keeps one stalled connection from hanging the whole crawl.
        wb_data = requests.get(url, timeout=10)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        houseUrls = soup.select('#page_list > ul > li > a')
        # Append at the end: insert(-1, ...) puts each new link before the
        # last element, which leaves the first link stuck at the end.
        # Anchors without an href are skipped so callers only get usable URLs.
        data.extend(
            link.get('href') for link in houseUrls if link.get('href')
        )
        print('Complete Page ', n)
    return data
# Scrape every listing linked from the search pages and store each record.
n = 0  # stays 0 when no listing links were found
for n, houseUrl in enumerate(GetHouseUrl(menuUrl), start=1):
    house_info.insert_one(GetInfo(houseUrl))  # write the details to the database
    print('Complete House ', n)
# Reference table of MongoDB comparison operators (kept as a bare string):
# greater than is $gt, less than is $lt, greater-or-equal is $gte,
# less-or-equal is $lte.
'''
(>) 大于 - $gt
(<) 小于 - $lt
(>=) 大于等于 - $gte
(<= ) 小于等于 - $lte
'''
# Print every stored listing whose 'rent' is greater than 500.
for item in house_info.find({'rent': {'$gt': 500}}):
    print(item)
参考资料:
MongoDB 教程:http://www.runoob.com/mongodb/mongodb-operators.html
网友评论