我有时候会上豆瓣看书评,一般是通过这个标签页面来找:
Paste_Image.png
但是这个页面不像淘宝,没有筛选功能,所以打算用爬虫爬下来自己筛选。
我主要爬取了这几个信息:标题、评分、阅读人数、页数、出版日期和价格。这些是我看书比较关注的东西。爬取下来的数据我选择直接存入pandas中的DataFrame来进行筛选。
下面是代码
# -*- coding: UTF-8 -*-
import re
import urllib
import urllib.parse
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup
# NOTE(review): the original author states that this Douban page has no
# anti-scraping protection, so no browser-like headers are sent -- confirm
# this still holds before relying on it.
def findTag(url):
    """Download *url* and return it parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails.
    """
    # The context manager closes the HTTP response as soon as the parser has
    # consumed it; the original left the connection open.
    with urllib.request.urlopen(url) as response:
        return BeautifulSoup(response, "html.parser")
def findTitle(soup):
    """Collect the book titles shown on a tag listing page.

    Args:
        soup: Parsed listing page. Every ``<h2>`` element that carries a
            ``class`` attribute is treated as one title.

    Returns:
        A list with one string per matching ``<h2>``, with every newline and
        space character removed from the element's text.
    """
    headings = soup.findAll('h2', {'class': True})
    # Both targets are literal characters, so plain str.replace is enough;
    # no regular expression is needed.
    return [
        heading.get_text().replace('\n', '').replace(' ', '')
        for heading in headings
    ]
def findRating(soup):
    """Collect the rating of every book on a tag listing page.

    Args:
        soup: Parsed listing page. Each ``<div class="star clearfix">`` is
            treated as one book.

    Returns:
        A list of floats, one per matching ``<div>``. A book whose rating
        span is missing or does not contain a number is reported as ``0.0``
        so the list stays aligned with the other per-book lists.
    """
    ratings = []
    for block in soup.findAll('div', {'class': 'star clearfix'}):
        try:
            score = float(block.find('span', {'class': 'rating_nums'}).get_text())
        except (AttributeError, ValueError):
            # AttributeError: there is no rating span (find() returned None).
            # ValueError: the span exists but its text is not a number.
            # Anything else (e.g. KeyboardInterrupt) is no longer swallowed.
            score = 0.0
        ratings.append(score)
    return ratings
def findPopularity(soup):
    """Collect the number of raters of every book on a tag listing page.

    The count is read from the ``<span class="pl">`` inside each
    ``<div class="star clearfix">``. The first run of digits in the span's
    text is used, which covers texts such as ``(123人评价)`` and
    ``(少于10人评价)``.

    Args:
        soup: Parsed listing page.

    Returns:
        A list of floats, one per matching ``<div>``. A book whose span is
        missing, or whose text contains no digits, is reported as ``0.0``
        instead of raising, so the list stays aligned with the other
        per-book lists.
    """
    counts = []
    for block in soup.findAll('div', {'class': 'star clearfix'}):
        label = block.find('span', {'class': 'pl'})
        digits = re.search(r'\d+', label.get_text()) if label is not None else None
        counts.append(float(digits.group()) if digits else 0.0)
    return counts
def _field_after_label(page, pattern):
    """Return the stripped text following the <span> matching *pattern*, or None."""
    label = page.find('span', text=pattern)
    if label is None or label.next_sibling is None:
        return None
    # str() detaches the value from the parse tree; strip() drops the
    # whitespace that surrounds the value in the page markup.
    return str(label.next_sibling).strip()


def findInfor(soup):
    """Fetch page count, publication year and price for every listed book.

    Each ``<h2>`` with a ``class`` attribute is expected to contain a link to
    the book's detail page. That page is downloaded once and all three fields
    are read from it.

    Author's note (translated): ``urlopen()`` is the most time-consuming call
    in the whole crawler, so use it as sparingly as possible and merge
    requests wherever you can.

    Args:
        soup: Parsed listing page.

    Returns:
        ``[thickness, year, price]``: three lists with one entry per book,
        in listing order. Each entry is the stripped text that follows the
        corresponding label, or ``None`` when the book has no link or the
        detail page lacks that label.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when a detail page
            cannot be fetched.
    """
    # Compiled once per call instead of once per book.
    patterns = (re.compile('页数'), re.compile('出版年'), re.compile('定价'))
    thickness, year, price = [], [], []
    columns = (thickness, year, price)

    for heading in soup.findAll('h2', {'class': True}):
        link = heading.find('a')
        href = link.attrs.get('href') if link is not None else None
        if href is None:
            # Keep the three lists aligned with the title list even when a
            # heading has no usable link.
            for column in columns:
                column.append(None)
            continue

        # Close each detail-page response once it has been parsed.
        with urllib.request.urlopen(href) as response:
            detail = BeautifulSoup(response, "html.parser")

        for column, pattern in zip(columns, patterns):
            column.append(_field_after_label(detail, pattern))

    return [thickness, year, price]
def switchPages(keyword, pages=1):
    """Scrape the Douban book listing for a tag into a DataFrame.

    For every listing page the titles, ratings, rater counts and per-book
    details are collected with ``findTitle``, ``findRating``,
    ``findPopularity`` and ``findInfor``. The collected list lengths and the
    resulting table are printed.

    Args:
        keyword: Tag name; it is percent-encoded before being put in the URL.
        pages: Number of listing pages to walk, starting from the first.
            Defaults to 1, which matches the previously hard-coded value.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Title``, ``rating``,
        ``popularity``, ``thickness``, ``year`` and ``price``.

    Raises:
        ValueError: Raised by pandas when the collected lists differ in
            length.
    """
    book_title_list = []
    rating_list = []
    popularity_list = []
    thickness_list = []
    year_list = []
    price_list = []

    tag = urllib.parse.quote(keyword)
    for page_index in range(pages):
        # "start" advances by 20 per page (presumably the listing's page
        # size -- confirm against the site).
        page = "https://book.douban.com/tag/{}?start={}&type=T".format(
            tag, 20 * page_index
        )
        listing = findTag(page)
        book_title_list.extend(findTitle(listing))
        rating_list.extend(findRating(listing))
        popularity_list.extend(findPopularity(listing))
        page_thickness, page_year, page_price = findInfor(listing)
        thickness_list.extend(page_thickness)
        year_list.extend(page_year)
        price_list.extend(page_price)

    # Printed so a length mismatch between the lists is easy to spot.
    print(len(book_title_list), len(rating_list), len(popularity_list),
          len(thickness_list), len(year_list), len(price_list))
    df = pd.DataFrame({
        'Title': book_title_list,
        'rating': rating_list,
        'popularity': popularity_list,
        'thickness': thickness_list,
        'year': year_list,
        'price': price_list,
    })
    print(df)
    # Returned so callers can filter the data; the original discarded it.
    return df
# Script entry: scrape the listing for the "编程" tag.
switchPages("编程")
网友评论