from bs4 import BeautifulSoup
import requests
# Scrape a single Xiaozhu listing page and pull out the fields of interest.
url = 'http://bj.xiaozhu.com/fangzi/1508951935.html'
# A timeout keeps the script from hanging forever on an unresponsive server.
wb_data = requests.get(url, timeout=10)
# Fail with a clear HTTP error here rather than an IndexError from the
# selectors below when the server answers with an error page.
wb_data.raise_for_status()
soup = BeautifulSoup(wb_data.text, 'lxml')

# select() returns a list of matches; the first match is used for each field.
title = soup.select('div.pho_info > h4')[0].text
address = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p')[0].get('title')
price = soup.select('#pricePart > div.day_l > span')[0].text
pic = soup.select('#imgMouseCusor')[0].get('src')
host_name = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')[0].get('title')
# First CSS class of the <span> inside the host's <h6>; it is later passed to
# get_gender(). An alternative selector that was tried: 'div.member_pic > div'.
host_gender = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > span')[0].get('class')[0]
def get_gender(gender):
    """Map a host icon CSS class to a gender label.

    Args:
        gender: CSS class name, e.g. 'member_girl_ico' or 'member_boy_ico'.

    Returns:
        '女' for 'member_girl_ico', '男' for 'member_boy_ico', and None for
        any other value.
    """
    known_icons = (
        ('member_girl_ico', '女'),
        ('member_boy_ico', '男'),
    )
    for icon_class, label in known_icons:
        if gender == icon_class:
            return label
    return None
# Gather the scraped fields into a single record and print it.
data = {
    'title': title,
    'address': address,
    'price': price,
    'pic': pic,
    'host_name': host_name,
    # Convert the icon CSS class into a readable gender label.
    'host_gender': get_gender(host_gender),
}
print(data)
# Module-level accumulator filled by get_page_link().
page_link = []


def get_page_link(page_number):
    """Collect listing anchors from search result pages into ``page_link``.

    Fetches the search pages numbered 1 up to, but not including,
    ``page_number`` and appends every ``a.resule_img_a`` element found to
    the module-level ``page_link`` list.

    Args:
        page_number: Exclusive upper bound of the page numbers to fetch;
            values of 1 or less fetch nothing.

    Returns:
        None. Results are accumulated in ``page_link``.
    """
    # NOTE(review): the upper bound is exclusive, so page `page_number` itself
    # is never fetched -- confirm this is intended.
    for each_number in range(1, page_number):
        full_url = 'http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(each_number)
        # A timeout keeps one unresponsive page from stalling the whole crawl.
        response = requests.get(full_url, timeout=10)
        page_soup = BeautifulSoup(response.text, 'lxml')
        # NOTE(review): the matched elements themselves are stored, not their
        # href attributes -- confirm against how page_link is consumed.
        page_link.extend(page_soup.select('a.resule_img_a'))
# "网友评论" (reader comments) -- stray page text captured along with the snippet; not part of the script.