当当网图书爬虫
import requests
from lxml import html
1.获取目标站点的源代码
# Download the search-result page for a fixed sample ISBN.
# The timeout keeps the script from blocking forever when the server accepts
# the connection but never sends a response.
url = 'http://search.dangdang.com/?key=9787115428028&act=input'
html_data = requests.get(url, timeout=10).text
2.xpath提取
# Parse the downloaded HTML text into an lxml element that can be queried with XPath.
selector = html.fromstring(html_data)
# Disabled experiment, kept for reference: the li[1] step restricts the query
# to the first <li> of the list, so it would return that entry's price text
# rather than every price on the page.
#price = selector.xpath('//div[@id="search_nature_rg"]/ul/li[1]/p[3]/span[1]/text()')
#print(price)
# Collect every <li> inside the element whose id is "search_nature_rg".
ul_list = selector.xpath('//div[@id="search_nature_rg"]/ul/li')
# Print how many <li> elements were matched.
print(len(ul_list))
3.遍历
# Walk every matched <li> and dump the raw XPath results for its four fields.
# Each value is the list returned by xpath(), printed on its own line in the
# order: price, book title, purchase link, store name.
for li in ul_list:
    price = li.xpath('p[3]/span[1]/text()')
    name = li.xpath('p[1]/a[1]/text()')
    link = li.xpath('p[1]/a[1]/@href')
    store = li.xpath('p[4]/a[1]/text()')
    print(price, name, link, store, sep='\n')
当当网爬虫2.0
import requests
from lxml import html
import pandas as pd
def _parse_listing(li):
    """Extract one seller's offer from a search-result ``<li>`` element.

    Returns a dict with the keys ``book_name``, ``link``, ``price`` and
    ``store``, or ``None`` when the element has no title, no link, or no
    price text that can be read as a number.
    """
    price_text = li.xpath('p[3]/span[1]/text()')
    book_name = li.xpath('p[@class="name"]/a/@title')
    link = li.xpath('p[@class="name"]/a/@href')
    if not (price_text and book_name and link):
        return None

    try:
        # Drop the currency sign (and any thousands separators) before parsing.
        price = float(price_text[0].replace('¥', '').replace(',', ''))
    except ValueError:
        return None

    # Listings without a store link fall back to the fixed default store name.
    store = li.xpath('p[@class="search_shangjia"]/a[1]/text()')
    return {
        'book_name': book_name[0],
        'link': link[0],
        'price': price,
        'store': store[0] if store else '当当自营',
    }


def spider(isbn):
    """Search dangdang.com for a book number and record every offer found.

    Downloads the search-result page for ``isbn``, builds one record per
    listing (title, purchase link, price, store name), prints the records
    ordered from the highest price to the lowest, and writes them to
    ``dangdang.csv`` in the current working directory.

    Listings that lack a title, a link or a parseable price are skipped
    instead of aborting the whole run; the printed listing count still
    reflects every ``<li>`` matched on the page.

    Args:
        isbn: The book number to search for, e.g. ``'9787115428028'``.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when no response arrives within the timeout.
    """
    # Let requests build the query string so the user-supplied value is
    # URL-encoded, and bound the wait so a stalled server cannot hang the run.
    response = requests.get(
        'http://search.dangdang.com/',
        params={'key': isbn, 'act': 'input'},
        timeout=10,
    )
    html_data = response.text

    # Every <li> under the results container is treated as one listing.
    selector = html.fromstring(html_data)
    ul_list = selector.xpath('//div[@id="search_nature_rg"]/ul/li')
    print('共有{}家售卖此书'.format(len(ul_list)))

    book_list = [
        record
        for record in map(_parse_listing, ul_list)
        if record is not None
    ]

    # Most expensive offer first.
    book_list.sort(key=lambda x: x['price'], reverse=True)

    # Show every collected record.
    for book in book_list:
        print(book)

    # Persist the records as CSV.
    df = pd.DataFrame(book_list)
    df.to_csv('dangdang.csv')
# Read the book number to search for from standard input.
isbn = input('请输入书号')
# Sample value for trying the script: 9787115428028
# Run the scraper for the entered book number.
spider(isbn)
网友评论