from bs4 import BeautifulSoup
import lxml
import time
import requests
# 58.com second-hand laptop listing URL; the page number is appended to it.
base_url = 'http://bj.58.com/pbdn/0/pn'
def get_website(url):
    """Fetch one listing page and crawl every item link found on it.

    Selects each Zhuanzhuan (转转) item anchor on the 58.com listing page
    and passes its href to ``get_details``.  Promoted items are excluded;
    no "regular" items were observed on the page, hence the selector swap
    noted below.

    :param url: full listing-page URL (base_url + page number)
    """
    response = requests.get(url)
    time.sleep(2)  # throttle between requests to avoid hammering the site
    soup = BeautifulSoup(response.text, 'lxml')
    # Original selector for regular listings (kept for reference):
    # wb_sites = soup.select('#infolist > div.infocon > table > tbody > tr.jztr > td.img > a')
    # Apart from promoted items, everything remaining is a Zhuanzhuan item,
    # so scrape those instead.
    wb_sites = soup.select('#infolist > div.infocon > table > tbody > tr.zzinfo > td.img > a')
    for wb_site in wb_sites:
        href = wb_site.get('href')
        get_details(href)
def get_details(href):
    """Fetch one item's detail page and print its scraped fields.

    Extracts the breadcrumb category, title, price, area, and view count
    from the detail page and prints them as a dict.

    :param href: URL of a single item's detail page
    """
    time.sleep(2)  # throttle between requests
    response = requests.get(href)
    soup = BeautifulSoup(response.text, 'lxml')
    cates = soup.select('#nav > div.breadCrumb.f12')
    items = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > h1')
    prices = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.price_li > span > i')
    areas = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.palce_li > span > i')
    views = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > p > span.look_time')
    for cate, item, price, area, view in zip(cates, items, prices, areas, views):
        data = {
            # breadcrumb has nested tags, so collect its text fragments
            'cate': list(cate.stripped_strings),
            'item': item.get_text(),
            'price': price.get_text(),
            'area': area.get_text(),
            'view': view.get_text()
        }
        print(data)
# Crawl the first five listing pages.
# NOTE: the original used range(1, 5), which only covers pages 1-4 despite
# the "first five pages" comment; fixed to range(1, 6).
for page_number in range(1, 6):
    url = base_url + str(page_number)
    get_website(url)
# 网友评论 ("netizen comments") — stray scraped text, commented out so the file parses