成果:

代码
from bs4 import BeautifulSoup
import requests
import re
import time
# Detail-page links collected by get_page_link(); iterate this list later.
page_link = []


def get_page_link(page_number):
    """Collect listing detail-page links from search-result pages 1..page_number-1.

    Each search-result page carries 24 listing links; every listing's href
    is appended to the module-level ``page_link`` list.

    NOTE(review): ``range(1, page_number)`` excludes page ``page_number``
    itself — callers must pass last-page + 1; kept as-is for compatibility.
    """
    for each_number in range(1, page_number):
        full_url = 'http://cd.xiaozhu.com/search-duanzufang-p{}-0/'.format(each_number)
        wb_data = requests.get(full_url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        for link in soup.select('#page_list > ul > li > a'):
            page_link.append(link.get('href'))
# Browser-like User-Agent header (defined here but not passed to the
# requests.get calls below — NOTE(review): consider wiring it in).
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36'}
# Site root URL.
url_saves = 'http://cd.xiaozhu.com/'
# Search-result page URLs for pages 2..9, fed to get_attractions() below.
url = ['http://cd.xiaozhu.com/search-duanzufang-p{}-0/'.format(str(i)) for i in range(2,10,1)]
def get_attractions(url, data=None):
    """Scrape one xiaozhu.com search-result page and every listing it links to.

    First layer: fetch the search-result page at *url* and print one summary
    dict (title, thumbnail, price, review count) per listing.  Second layer:
    follow each listing link, fetch its detail page, and print a detail dict
    (title, address, price, photo, host name, host gender, page link).

    ``data`` is unused; kept only so existing callers keep working.
    """

    def get_gender(classname):
        """Map the host avatar <div>'s CSS class to a gender label."""
        if classname == 'member_ico':
            return '男'
        if classname == 'member_ico1':
            return '女'

    # First layer: the search-result (index) page.
    # BUG FIX: was requests.get(url_saves), which ignored the `url` argument
    # and scraped the site root on every call.
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    title = soup.select('#page_list > ul > li > div.result_btm_con.lodgeunitname > div > a > span')
    image = soup.select('#page_list > ul > li > a > img')
    price = soup.select('#page_list > ul > li > div.result_btm_con.lodgeunitname > span.result_price > i')
    evaluation = soup.select('#page_list > ul > li > div.result_btm_con.lodgeunitname > div > em > span')
    link = soup.select('#page_list > ul > li > a')

    for titles, images, prices, evaluations in zip(title, image, price, evaluation):
        data = {
            'titles': titles.get_text(),
            # Thumbnails are lazy-loaded; the real URL lives in `lazy_src`.
            'images': images.get('lazy_src'),
            'prices': prices.get_text(),
            'evaluations': evaluations.get_text()
        }
        print(data)

    # Second layer: each rental's detail page linked from the index page.
    for links in link:
        getlink = links.get('href')
        get_data = requests.get(getlink)
        getsoup = BeautifulSoup(get_data.text, 'lxml')
        # select() always returns a list; on a single page the only match
        # is taken out with [0].
        title1 = getsoup.select('div.pho_info > h4')[0].text
        address = getsoup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p > span.pr5')[0].text
        price1 = getsoup.select('#pricePart > div.day_l > span')[0].text
        photo = getsoup.select('#curBigImage')[0].get('src')
        homename = getsoup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')[0].text
        # get('class') returns a list like ['member_ico']; [0] extracts the
        # class name string.
        homegender = getsoup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > div')[0].get('class')[0]
        data1 = {
            'title1s': title1,
            'addresss': address,
            'price1s': price1,
            'photos': photo,
            'homenames': homename,
            # BUG FIX: `homelink` was never defined (NameError at runtime);
            # use the detail-page URL that was just fetched.
            'homelinks': getlink,
            'homegenders': get_gender(homegender)
        }
        print(data1)
调用函数
# Drive the scraper over every prepared search-result page URL.
for page_url in url:
    print(page_url)
    get_attractions(page_url)
网友评论