BeautifulSoup 库的使用:Tag 对象的常用方法与属性
- select
- find_all
- parent
- string
- get('id')
# encoding: utf-8
"""
@author: Hammurabi
@contact: breakawayroad@gmail.com
@site: http://
@software: PyCharm
@file: web_parser.py
@time: 8/30/16 9:34 PM
"""
from bs4 import BeautifulSoup

# Local HTML page to scrape.
path = './index.html'

# Parse the page; the file handle is only needed while BeautifulSoup reads it.
with open(path, 'r') as wb_data:
    soup = BeautifulSoup(wb_data, 'lxml')

# Each select() returns the matching tags as a list. The lists are zipped
# below, so this relies on every product contributing exactly one match per
# selector (NOTE(review): confirm against the actual page markup).
imgs = soup.select('body > div > div > div.col-md-9 > div > div > div > img')
prices = soup.select('h4.pull-right')
names = soup.select('div.caption > h4 > a')
reviews = soup.select('div.ratings > p.pull-right')
ratings = soup.select('div.ratings')

for img, price, name, review, rating in zip(imgs, prices, names, reviews, ratings):
    img_url = img.get('src')
    price_text = price.string
    name_text = name.string

    # The review tag text looks like "15 reviews"; take the leading token so
    # that a singular "1 review" is parsed correctly as well. int() still
    # raises ValueError if the text does not start with a number.
    review_text = review.get_text(strip=True)
    review_cnt = int(review_text.partition(' ')[0])

    # One <span> carrying the class "glyphicon-star" is counted per star.
    star_cnt = len(rating.find_all('span', class_='glyphicon-star'))

    data = {
        'image': img_url,
        'price': price_text,
        'name': name_text,
        'review': review_cnt,
        'stars': star_cnt,
    }
    print(data)
网友评论