用Python在豆瓣上找书看

作者: dalalaa | 来源:发表于2017-01-17 21:51 被阅读331次

    我有时候会上豆瓣上看书评,一般是通过这个标签页面来找:

    Paste_Image.png

    但是这个页面不像淘宝,没有筛选功能,所以打算用爬虫爬下来自己筛选。
    我主要爬取了这几个信息:标题、评分、阅读人数、页数、出版日期和价格。这些是我看书比较关注的东西。爬取下来的数据我选择直接存入pandas中的DataFrame来进行筛选。
    下面是代码

    #-*- coding: UTF-8 -*-
    import re
    import urllib
    import urllib.parse
    import urllib.request

    import pandas as pd
    from bs4 import BeautifulSoup
    #豆瓣这个页面没有反爬虫,所以不需要伪装成浏览器。
    def findTag(url):
        """Download ``url`` and return its HTML parsed as a BeautifulSoup tree.

        Args:
            url: Address of the page to fetch.

        Returns:
            A ``BeautifulSoup`` object built with the ``html.parser`` backend.

        Any exception raised by ``urllib.request.urlopen`` (for example on a
        network failure) is propagated to the caller.
        """
        # Parse inside the ``with`` block so the HTTP response is closed as
        # soon as BeautifulSoup has consumed it, instead of leaking the socket.
        with urllib.request.urlopen(url) as source_code:
            return BeautifulSoup(source_code, "html.parser")
    def findTitle(soup):
        """Collect the text of every ``<h2>`` element that has a ``class`` attribute.

        All newline and space characters are removed from each heading's text.

        Args:
            soup: Parsed page exposing ``findAll``.

        Returns:
            A list of cleaned heading strings, in the order ``findAll`` yields them.
        """
        headings = soup.findAll('h2', {'class': True})
        return [
            heading.get_text().replace('\n', '').replace(' ', '')
            for heading in headings
        ]
    def findRating(soup):
        """Return one numeric rating per ``div`` with class ``star clearfix``.

        Args:
            soup: Parsed page exposing ``findAll``.

        Returns:
            A list of floats, one per matching ``div``. An entry is ``0.0`` when
            the ``div`` has no ``span.rating_nums`` or when that span's text is
            not a valid number, so the list stays aligned with the other
            per-book lists built from the same page.
        """
        rating = []
        for item in soup.findAll('div', {'class': 'star clearfix'}):
            span = item.find('span', {'class': 'rating_nums'})
            if span is None:
                # There may be no rating element at all for this entry.
                rating.append(0.0)
                continue
            try:
                r = float(span.get_text())
            except ValueError:
                # Only a failed number parse means "no rating"; anything else
                # (including KeyboardInterrupt) must propagate rather than be
                # silently turned into 0.0.
                r = 0.0
            rating.append(r)
        return rating
    def findPopularity(soup):
        """Return the number of raters for each ``div`` with class ``star clearfix``.

        The count is read from the ``span.pl`` inside each ``div``; its text is
        expected to look like ``(1234人评价)`` or ``(少于10人评价)``. The first run
        of digits in the text is used as the count.

        Args:
            soup: Parsed page exposing ``findAll``.

        Returns:
            A list of floats, one per matching ``div``. An entry is ``0.0`` when
            the span is missing or its text contains no digits, which keeps the
            list the same length as the other per-book lists instead of
            aborting the crawl.
        """
        popularity = []
        for item in soup.findAll('div', {'class': 'star clearfix'}):
            span = item.find('span', {'class': 'pl'})
            text = span.get_text() if span is not None else ''
            # One raw-string pattern replaces the chain of substitutions and
            # does not depend on the exact wording around the number.
            match = re.search(r'\d+', text)
            popularity.append(float(match.group()) if match else 0.0)
        return popularity
    def _fieldValue(page, label):
        """Return the node following the first ``<span>`` whose text matches ``label``.

        String values are returned with surrounding whitespace stripped; ``None``
        is returned when no such span (or no following sibling) exists.
        """
        span = page.find('span', string=re.compile(label))
        if span is None:
            return None
        value = span.next_sibling
        return value.strip() if isinstance(value, str) else value

    def findInfor(soup):
        """Fetch each listed book's detail page and read page count, year and price.

        For every ``<h2>`` with a ``class`` attribute in ``soup``, the linked
        page is downloaded and the values following the ``页数``, ``出版年`` and
        ``定价`` labels are collected.

        Note: ``urlopen()`` is the most time-consuming call in the whole
        crawler, so each detail page is fetched exactly once and all three
        fields are read from that single response.

        Args:
            soup: Parsed listing page exposing ``findAll``.

        Returns:
            ``[thickness, year, price]`` -- three lists with one entry per
            ``<h2>``. An entry is ``None`` when the heading has no link or the
            detail page lacks that field, so the three lists always stay the
            same length.
        """
        thickness = []
        year = []
        price = []
        for item in soup.findAll('h2', {'class': True}):
            link = item.find('a')
            href = link.attrs.get('href') if link is not None else None
            if not href:
                # Keep the columns aligned even when there is nothing to fetch.
                thickness.append(None)
                year.append(None)
                price.append(None)
                continue
            # Close the response as soon as the page has been parsed.
            with urllib.request.urlopen(href) as response:
                soup1 = BeautifulSoup(response, "html.parser")
            thickness.append(_fieldValue(soup1, '页数'))
            year.append(_fieldValue(soup1, '出版年'))
            price.append(_fieldValue(soup1, '定价'))
        infor = [thickness, year, price]
        return infor
    def switchPages(keyword, pages=1):
        """Crawl the Douban book tag listing for ``keyword`` into a DataFrame.

        Each listing page is requested with ``start`` advanced in steps of 20.
        Title, rating, popularity, page count, year and price are gathered via
        ``findTitle``, ``findRating``, ``findPopularity`` and ``findInfor``.

        Args:
            keyword: Tag name; it is URL-quoted before being put in the address.
            pages: Number of listing pages to crawl. Defaults to 1, which
                matches the previous hard-coded behaviour.

        Returns:
            The ``pandas.DataFrame`` that is also printed, with columns
            ``Title``, ``rating``, ``popularity``, ``thickness``, ``year`` and
            ``price``.
        """
        book_title_list = []
        rating_list = []
        popularity_list = []
        thickness_list = []
        year_list = []
        price_list = []
        for i in range(pages):
            page = ("https://book.douban.com/tag/" + urllib.parse.quote(keyword)
                    + "?start=" + str(20 * i) + "&type=T")
            listing = findTag(page)
            book_title_list.extend(findTitle(listing))
            rating_list.extend(findRating(listing))
            popularity_list.extend(findPopularity(listing))
            thickness, year, price = findInfor(listing)
            thickness_list.extend(thickness)
            year_list.extend(year)
            price_list.extend(price)
        # Print the column lengths first: the DataFrame constructor below needs
        # them all equal, so a mismatch is visible before it fails.
        print(len(book_title_list),len(rating_list),len(popularity_list),len(thickness_list),len(year_list),len(price_list))
        df = pd.DataFrame({'Title':book_title_list,'rating':rating_list,'popularity':popularity_list,'thickness':thickness_list,'year':year_list,'price':price_list})
        print(df)
        return df
    
    # Run the demo crawl for the "编程" tag only when executed as a script, so
    # that importing this module does not trigger network requests.
    if __name__ == "__main__":
        switchPages("编程")
    
    

    相关文章

      网友评论

        本文标题:用Python在豆瓣上找书看

        本文链接:https://www.haomeiwen.com/subject/cuhvbttx.html