
[Crawler series]: Scraping Xiaozhu short-term rentals (xiaozhu.com)

Author: dataheart | Published 2016-05-15 22:38

Result:

(screenshot of the scraped output)

Code
from bs4 import BeautifulSoup
import requests
import time

page_link = []   # used to save the link of every detail page, so later we can simply iterate over it

def get_page_link(page_number):
    # each listing page holds 24 links; just pass in how many pages to walk
    for each_number in range(1, page_number):
        full_url = 'http://cd.xiaozhu.com/search-duanzufang-p{}-0/'.format(each_number)
        wb_data = requests.get(full_url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        for link in soup.select('#page_list > ul > li > a'):
            page_link.append(link.get('href'))

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36'}
url = ['http://cd.xiaozhu.com/search-duanzufang-p{}-0/'.format(i) for i in range(2, 10)]
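get_page_link above is only defined; nothing later in the post calls it. A minimal usage sketch, assuming you want to fill page_link with detail-page links before visiting them (the page count of 4 is just an illustrative value):

# Hypothetical usage of get_page_link: crawl listing pages 1..3 and preview
# the detail-page links that were collected.
get_page_link(4)                      # range(1, 4) -> pages 1, 2 and 3
print(len(page_link), 'detail links collected')
for detail_url in page_link[:5]:
    print(detail_url)                 # e.g. http://cd.xiaozhu.com/fangzi/....html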

def get_attractions(url, data=None):

Layer 1 --- scraping the listing (search-results) page:

    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')

    # each select() below returns one list per field, in page order
    title = soup.select('#page_list > ul > li > div.result_btm_con.lodgeunitname > div > a > span')
    image = soup.select('#page_list > ul > li > a > img')
    price = soup.select('#page_list > ul > li > div.result_btm_con.lodgeunitname > span.result_price > i')
    evaluation = soup.select('#page_list > ul > li > div.result_btm_con.lodgeunitname > div > em > span')
    link = soup.select('#page_list > ul > li > a')

    # zip the parallel lists so each iteration holds the fields of one listing
    for titles, images, prices, evaluations in zip(title, image, price, evaluation):
        data = {
            'titles': titles.get_text(),
            'images': images.get('lazy_src'),
            'prices': prices.get_text(),
            'evaluations': evaluations.get_text()
        }
        print(data)
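The zip() loop above only prints each listing. If you would rather keep the records around, a small variation of that loop could collect them instead (results is a hypothetical container, not in the original code):

    results = []   # hypothetical container for the parsed listings
    for titles, images, prices, evaluations in zip(title, image, price, evaluation):
        results.append({
            'titles': titles.get_text(),
            'images': images.get('lazy_src'),
            'prices': prices.get_text(),
            'evaluations': evaluations.get_text()
        })
    print(len(results), 'listings parsed on this page')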

Layer 2 --- scraping the detail page of every rental found on the listing page:

    for links in link:
        getlink = links.get('href')
        # getlink = 'http://cd.xiaozhu.com/fangzi/731556139.html'
        get_data = requests.get(getlink, headers=headers)
        getsoup = BeautifulSoup(get_data.text, 'lxml')

        # Even on a single page, select() returns a list; the first (and only)
        # element is the information we want, so index it out with [0].
        title1 = getsoup.select('div.pho_info > h4')[0].text
        address = getsoup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p > span.pr5')[0].text
        price1 = getsoup.select('#pricePart > div.day_l > span')[0].text
        photo = getsoup.select('#curBigImage')[0].get('src')
        homename = getsoup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')[0].text
        # presumably the host's profile page: the href of the same anchor as homename
        homelink = getsoup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')[0].get('href')

        # .get('class') returns a list such as ['member_ico'], so take the class
        # name itself out with another [0]
        homegender = getsoup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > div')[0].get('class')[0]

        def get_gender(classname):
            # the host's gender is encoded in the CSS class of the avatar icon
            if classname == 'member_ico':
                return '男'      # male
            if classname == 'member_ico1':
                return '女'      # female

        # rule of thumb: .get('attr') for a tag attribute, .get_text() for the text inside a tag
        data1 = {
            'title1s': title1, 'addresss': address, 'price1s': price1, 'photos': photo,
            'homenames': homename, 'homelinks': homelink, 'homegenders': get_gender(homegender)
        }
        print(data1)
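The comments above lean on two BeautifulSoup habits: select() always returns a list (hence the [0]), and .get('class') also returns a list of class names (hence the second [0]). A tiny self-contained illustration with made-up HTML, not the real Xiaozhu markup:

from bs4 import BeautifulSoup

demo_html = '<div class="member_ico"><h4>Cosy room near Chunxi Road</h4></div>'   # invented sample
demo = BeautifulSoup(demo_html, 'lxml')

h4_list = demo.select('h4')                    # a list, even though there is only one match
print(h4_list[0].get_text())                   # -> Cosy room near Chunxi Road

classes = demo.select('div')[0].get('class')   # -> ['member_ico'], a list of class names
print(classes[0])                              # -> 'member_ico', the string get_gender() expects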

Calling the function:

for single_umls in url:
    print(single_umls)
    get_attractions(single_umls)
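time is imported at the top but never used, and the driver loop above fires its requests back to back. A gentler sketch of the same loop, assuming a pause between listing pages is acceptable (the 2-second value is arbitrary, not something the post specifies):

for single_umls in url:
    print(single_umls)
    get_attractions(single_umls)
    time.sleep(2)   # arbitrary pause so the site is not hammered page after page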
