- 练手系列,直接上代码
import csv
import time

import requests
from lxml import etree
# Scrape the Douban book Top 250 listing and save one CSV row per book.
#
# Ten listing pages are requested (start offsets 0, 25, ..., 225). Each
# <table> found under the content container is treated as one book entry and
# written as: title, link, score, rating-count text, and - when present - the
# text of the span under td[2]/p[2] as a fifth column.
#
# newline='' hands line-ending control to the csv module, as its documentation
# requires; otherwise Windows text mode would emit an extra '\r' per row.
with open('C:\\Users\\HY\\Desktop\\book_douban.csv', 'w', encoding='utf-8', newline='') as f:
    # csv.writer quotes any field containing a comma, a double quote or a
    # newline, so such values can no longer shift or split the columns.
    writer = csv.writer(f)
    for page in range(10):
        url = 'https://book.douban.com/top250?start={}'.format(page * 25)
        # The timeout keeps a stalled connection from blocking the script forever.
        data = requests.get(url, timeout=10).text
        s = etree.HTML(data)
        book = s.xpath('//*[@id="content"]/div/div[1]/div/table')
        # Pause between page requests.
        time.sleep(3)
        for div in book:
            title = div.xpath('./tr/td[2]/div[1]/a/@title')[0]
            href = div.xpath('./tr/td[2]/div[1]/a/@href')[0]
            score = div.xpath('./tr/td[2]/div[2]/span[2]/text()')[0]
            raw_num = div.xpath('./tr/td[2]/div[2]/span[3]/text()')[0]
            # Trim outer whitespace first, then the parentheses, then whatever
            # whitespace sat just inside them; stripping "(" / whitespace / ")"
            # in that order left whitespace in front of ")" in the output.
            num = raw_num.strip().strip('()').strip()
            scribe = div.xpath('./tr/td[2]/p[2]/span/text()')
            row = [title, href, score, num]
            if scribe:
                # The fifth column is optional; rows without it keep four fields.
                row.append(scribe[0])
            writer.writerow(row)
-
输出情况如图(数据过多,截取部分)
TOP250图书信息.png
网友评论