# Scrape the detail-page links from the listing pages (抓取的详情页链接),
# then extract the detailed information from each detail page (详情页的详细信息).
from bs4 import BeautifulSoup
import requests
import time
# Listing (search-result) pages to crawl: pages 1-13 of the short-term
# rental search.  str() around i was redundant — str.format handles ints.
sourceurls = ['http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(i)
              for i in range(1, 14)]
detail_urls = []  # detail-page links collected by get_detail_url()
detailku = []     # per-listing info dicts collected by get_dtail_info()
def get_detail_url(urls):
    """Fetch one listing page and append every detail-page link found on it
    to the module-level ``detail_urls`` list.

    urls: URL of a single listing (search-result) page.
    """
    web_data = requests.get(urls)
    # Throttle between requests so we don't hammer the server.
    time.sleep(2)
    soup = BeautifulSoup(web_data.text, 'lxml')
    # Each <li> under #page_list is one listing; its <a href> is the
    # detail-page link.
    detail_urls.extend(a.get('href')
                       for a in soup.select('#page_list > ul > li > a'))
    print(detail_urls, len(detail_urls))
# sourceurls is a list, so feed each listing page into the scraper one at a time.
for page_url in sourceurls:
    get_detail_url(page_url)
def get_dtail_info(url):
    """Fetch one detail page, extract the listing's info, and append it as a
    dict to the module-level ``detailku`` list.

    NOTE(review): the name has a typo ("dtail"); kept unchanged for existing
    callers.

    url: URL of a single detail page.
    """
    web_data = requests.get(url)
    # Throttle between requests so we don't hammer the server.
    time.sleep(1)
    soup = BeautifulSoup(web_data.text, 'lxml')
    titles = soup.select('h4 > em')
    areas = soup.select('span.pr5')
    day_prices = soup.select('div.day_l > span')
    house_pics = soup.select('#curBigImage')
    landlord_pics = soup.select('div.member_pic > a > img')
    # A 'member_ico' div appears to mark a male landlord on this site —
    # presumably; confirm against the page markup.
    landlord_gender = '男' if soup.find_all('div', 'member_ico') else '女'
    landlord_names = soup.select('a.lorder_name')
    # BUG FIX: the original passed the gender *string* itself to zip(), and
    # since len('男') == 1 the zip was truncated to a single record no matter
    # how many listings the page held.  Repeat the scalar once per title.
    landlord_genders = [landlord_gender] * len(titles)
    for title, area, day_price, house_pic, landlord_pic, gender, landlord_name in zip(
            titles, areas, day_prices, house_pics, landlord_pics,
            landlord_genders, landlord_names):
        data = {
            'title': title.get_text(),
            'area': area.get_text(),
            'day_price': day_price.get_text(),
            'house_pic': house_pic.get('src'),
            'landlord_pic': landlord_pic.get('src'),
            'landlord_gender': gender,
            'landlord_name': landlord_name.get_text()
        }
        detailku.append(data)
        print(data, len(detailku))
# Scrape every detail-page link collected above.
for detail_page_url in detail_urls:
    get_dtail_info(detail_page_url)
# TODO: user comments (网友评论) — not implemented.