目的:将按照 http://www.jianshu.com/p/ef1028a4668e 所介绍的方法爬取到的内容存入 MongoDB,并查询价格超过 500 的信息。
<pre><code>from bs4 import BeautifulSoupimport requestsimport pymongoclient = pymongo.MongoClient('localhost',27017)xiaozu = client['xiaozu']xinxi = xiaozu['xinxi']def sex_judge(sex): for isex in sex: if isex == 'member_ico': return 'male' elif isex == 'member_ico1': return 'female' else: return 'None'end_page = input('end_page:')for i in range(1, int(end_page)): base_url = 'http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(i) #print(base_url) html1 = requests.get(base_url) bsObj1 = BeautifulSoup(html1.text, 'lxml') detail_url = bsObj1.find_all('a', {'class':'resule_img_a'}) for i in detail_url: html2 = requests.get(i.get('href')) bsObj2 = BeautifulSoup(html2.text, 'lxml') image = bsObj2.select('#curBigImage')[0].get('src') landlord_image = bsObj2.select('#floatRightBox > div.js_box.clearfix > div.member_pic > a > img')[0].get('src') title = bsObj2.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h4 > em')[0].get_text() address = list(bsObj2.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p > span')[0].stripped_strings) price = bsObj2.select('#pricePart > div.day_l > span')[0].get_text() name = bsObj2.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')[0].get_text() sexs = sex_judge(bsObj2.select('#floatRightBox > div.js_box.clearfix > div.member_pic > div')[0].get('class')) data = { 'image':image, 'landlord_image':landlord_image, 'title':title, 'address':address, 'price':int(price), 'name':name, 'sexs':sexs } xinxi.insert_one(data)for item in xinxi.find({'price':{'$gt':500}}): print(item)</code></pre>
网友评论