下面是代码:
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import time
import pymongo #操作数据库
# MongoDB connection and handles used by the crawler below.
client = pymongo.MongoClient('localhost',27017) # connect to the local MongoDB server
xiaozhu = client['xiaozhu'] # database "xiaozhu" (created lazily on first write)
fangzi = xiaozhu['fangzi'] # collection "fangzi" holding one document per listing
def get_fangzi_url(pages=1): # collect listing-detail URLs from the search pages
    """Scrape the xiaozhu.com short-rental search pages for listing URLs.

    Args:
        pages: number of search-result pages to crawl (default 1, matching
            the original hard-coded ``range(1, 2)`` behaviour).

    Returns:
        A list of listing-detail URL strings (the ``href`` of each result link).
    """
    urls = ['http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(i)
            for i in range(1, pages + 1)]
    fangzi_list = []
    for url in urls:
        # timeout so a stalled server cannot hang the crawl indefinitely
        web_data = requests.get(url, timeout=10)
        web_data.encoding = 'utf-8'
        soup = BeautifulSoup(web_data.text, 'lxml')
        for fangzi_urls in soup.select('#page_list > ul > li > a'):
            fangzi_list.append(fangzi_urls['href'])
    return fangzi_list
def get_info_and_insert(): # fetch each listing page and store its details
    """Crawl every listing URL from ``get_fangzi_url`` and insert one
    document per listing (``title``, ``area``, ``price``) into the
    ``fangzi`` collection.

    Sleeps 2 seconds between requests to stay polite / avoid bans.
    """
    urls = get_fangzi_url()
    for url in urls:
        # timeout so one dead listing page cannot hang the whole crawl
        web_data = requests.get(url, timeout=10)
        web_data.encoding = 'utf-8'
        content = web_data.text
        soup = BeautifulSoup(content,'lxml')
        fangzi_info = {} # dict holding this listing's fields
        fangzi_info['title'] = soup.select('em')[1].text
        fangzi_info['area'] = soup.select('.pr5')[0].text.strip()
        # strip so the stored price is clean digits and int() in
        # find_fangzi() cannot trip on surrounding whitespace
        fangzi_info['price'] = soup.select('#pricePart .day_l span')[0].text.strip()
        fangzi.insert_one(fangzi_info) # persist to MongoDB
        print(url) # progress indicator
        time.sleep(2) # be polite: 2s between requests to avoid getting blocked
def find_fangzi(): # print listings priced at 300 or above
    """Scan the ``fangzi`` collection and print every stored listing
    whose ``price`` (stored as a numeric string) is at least 300.
    """
    expensive = (doc for doc in fangzi.find() if int(doc['price']) >= 300)
    for doc in expensive:
        print(doc)
# get_info_and_insert()  # run once to crawl the listings and fill the database
find_fangzi() # then query the stored listings for prices >= 300
关键点:
- 确定信息地址
- 数据库操作
- 终于学会了markdown引用代码了,原来是键盘左上角tab键上面那个小点
网友评论