Scraping Douban Books with Scrapy


Author: pkxutao | Published 2018-01-24 10:01

    Scrapy is a very powerful crawling framework that takes only a little configuration to get going, so as practice I wrote a small crawler that collects book information from Douban Books.
    The overall flow: pick an arbitrary book's detail page as the entry point, scrape that book's details, then follow the books listed under the "people who like this book also like" section at the bottom of the page, and repeat, which (in theory) eventually reaches every book.
    First, the spider file.
    start_urls holds a single book as the entry point. The parse callback extracts the fields we need from the detail page into an item, and at the end yield scrapy.Request(url) queues the similar books' detail pages for later crawling. The code:

    import scrapy
    import re
    from test_spider.items import TestSpiderItem

    class DmozSpider(scrapy.spiders.Spider):
        itemCount = 0
        name = "dbbook"
        allowed_domains = ["douban.com"]
        start_urls = ["https://book.douban.com/subject/27609047/"]
    
        def parse(self, response):
            book = TestSpiderItem()
            imgUrl = response.xpath("//div[@id='mainpic']/a[@class='nbg']/@href").extract_first()
            name = response.xpath("//span[@property='v:itemreviewed']/text()").extract_first()
            score = response.xpath("//strong[@property='v:average']/text()").extract_first().strip()
            label = response.xpath("//a[@class='  tag']/text()").extract()
            book['url'] = response.url
            book['label'] = label
            book['score'] = score
            book['imgUrl'] = imgUrl
            book['name'] = name
            infos = response.xpath("//div[@id='info']")
            curType = ""  # which field the current text node belongs to
            # the first link inside #info is the author ("作者" is the page's label)
            if "作者" in infos.extract_first():
                author = infos.xpath(".//a/text()").extract_first().strip()
                book['author'] = self.getFormatStr(author)
            for info in infos.xpath("./*|./text()"):
                name = info.xpath("text()").extract_first()
                if name is not None:
                    curType = ""  # an element node carries a label; reset before matching it
                if "出版社:" == name:
                    curType = "press"
                    continue
                elif "出版年:" == name:
                    curType = "publishYear"
                    continue
                elif "页数:" == name:
                    curType = "pageCount"
                    continue
                elif "定价:" == name:
                    curType = "price"
                    continue
                elif "ISBN:" == name:
                    curType = "isbn"
                    continue
                elif "装帧:" == name:
                    curType = "binding"
                    continue
    
                span = info.extract()
                span = span.strip()  # trim surrounding whitespace
                span = span.replace("\n", "")  # drop newlines
                span = span.replace("<br>", "")  # drop <br> tags
                if len(span) != 0:
                    if curType == "press":
                        book['press'] = span
                    elif curType == "publishYear":
                        book['publishYear'] = span
                    elif curType == "pageCount":
                        book['pageCount'] = int(re.sub(r"\D", "", span))  # keep digits only, dropping the colon and unit
                    elif curType == "price":
                        book['price'] = float(re.findall(r"\d+\.?\d*", span)[0])  # first number on the line is the price
                    elif curType == "isbn":
                        book['isbn'] = span
                    elif curType == "binding":
                        book['binding'] = span
            yield book
            # queue the similar books from the "people who like this also like" section
            similarUrls = response.xpath("//div[@id='db-rec-section']/div[@class='content clearfix']/dl/dt/a/@href").extract()
            for url in similarUrls:
                if self.itemCount < 10:
                    # itemCount is never incremented, so this guard never triggers;
                    # uncomment the next line to cap the crawl at roughly 10 books
                    # self.itemCount += 1
                    yield scrapy.Request(url)
    
        def getFormatStr(self, params):
            params = params.strip()  # trim surrounding whitespace
            params = params.replace(" ", "")  # drop inner spaces
            params = params.replace("\n", "")  # drop newlines
            return params
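
    The spider imports TestSpiderItem from test_spider.items, which isn't shown above. A minimal sketch of what that items.py would look like, with one Field per attribute the spider assigns (inferred from the parse code, not copied from the repo):

    import scrapy

    class TestSpiderItem(scrapy.Item):
        url = scrapy.Field()          # detail page URL
        imgUrl = scrapy.Field()       # cover image URL
        name = scrapy.Field()         # title
        author = scrapy.Field()
        score = scrapy.Field()        # Douban average rating
        label = scrapy.Field()        # list of tags
        press = scrapy.Field()
        publishYear = scrapy.Field()
        pageCount = scrapy.Field()
        price = scrapy.Field()
        isbn = scrapy.Field()
        binding = scrapy.Field()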
    

    Next, storage: the pipeline writes the scraped fields into a MySQL database. Some books are missing certain fields, so the INSERT statement has to be built dynamically from whichever fields are actually present:

    # -*- coding: utf-8 -*-
    import pymysql

    # item field -> database column, in insert order
    FIELD_COLUMNS = [
        ('url', 'URL'),
        ('imgUrl', 'IMG_URL'),
        ('author', 'AUTHOR'),
        ('name', 'NAME'),
        ('press', 'PRESS'),
        ('score', 'SCORE'),
        ('pageCount', 'PAGE_COUNT'),
        ('price', 'PRICE'),
        ('isbn', 'ISBN'),
        ('publishYear', 'PUBLISH_YEAR'),
        ('binding', 'BINDING'),
        ('label', 'LABEL'),
    ]

    class TestSpiderPipeline(object):

        def __init__(self):
            self.conn = pymysql.connect("localhost", "root", "root", "douban_book", charset='utf8mb4')
            self.cursor = self.conn.cursor()

        def process_item(self, item, spider):
            columns = []
            params = []
            for field, column in FIELD_COLUMNS:
                if field not in item:
                    continue  # only insert the fields this book actually has
                value = item[field]
                if field == 'score' and value == "":
                    continue  # books without enough ratings have an empty score
                if field == 'label':
                    value = ",".join(value)  # store the tag list as a comma-separated string
                columns.append(column)
                params.append(value)
            sql = "insert into book(" + ",".join(columns) + ") VALUES(" + ",".join(["%s"] * len(params)) + ")"
            self.cursor.execute(sql, params)
            self.conn.commit()
            return item

        def close_spider(self, spider):
            self.conn.close()
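
    The pipeline assumes a book table already exists in the douban_book database. A possible schema, inferred from the column names above (the types are my guesses, not taken from the original project):

    CREATE TABLE book (
        ID           INT AUTO_INCREMENT PRIMARY KEY,
        URL          VARCHAR(255),
        IMG_URL      VARCHAR(255),
        AUTHOR       VARCHAR(255),
        NAME         VARCHAR(255),
        PRESS        VARCHAR(255),
        SCORE        FLOAT,
        PAGE_COUNT   INT,
        PRICE        FLOAT,
        ISBN         VARCHAR(32),
        PUBLISH_YEAR VARCHAR(64),
        BINDING      VARCHAR(64),
        LABEL        VARCHAR(255)
    ) DEFAULT CHARSET=utf8mb4;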
    

    Even with a 3-second download delay, my IP got banned after roughly 1,000 books... so I bought a proxy service on Taobao that rotates the proxy every two minutes, and the crawl happily resumed. The proxy setup:

    # -*- coding: utf-8 -*-
    
    # Define here the models for your spider middleware
    #
    # See documentation in:
    # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
    
    from scrapy import signals
    import requests
    
    
    class TestSpiderSpiderMiddleware(object):
        # Not all methods need to be defined. If a method is not defined,
        # scrapy acts as if the spider middleware does not modify the
        # passed objects.
    
        @classmethod
        def from_crawler(cls, crawler):
            # This method is used by Scrapy to create your spiders.
            s = cls()
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
            return s
    
        def process_spider_input(self, response, spider):
            # Called for each response that goes through the spider
            # middleware and into the spider.
    
            # Should return None or raise an exception.
            return None
    
        def process_spider_output(self, response, result, spider):
            # Called with the results returned from the Spider, after
            # it has processed the response.
    
            # Must return an iterable of Request, dict or Item objects.
            for i in result:
                yield i
    
        def process_spider_exception(self, response, exception, spider):
            # Called when a spider or process_spider_input() method
            # (from other spider middleware) raises an exception.
    
            # Should return either None or an iterable of Response, dict
            # or Item objects.
            pass
    
        def process_start_requests(self, start_requests, spider):
            # Called with the start requests of the spider, and works
            # similarly to the process_spider_output() method, except
            # that it doesn’t have a response associated.
    
            # Must return only requests (not items).
            for r in start_requests:
                yield r
    
        def spider_opened(self, spider):
            spider.logger.info('Spider opened: %s' % spider.name)
    
        def process_request(self, request, spider):
            # note: process_request is a downloader-middleware hook, so this class
            # must be enabled in DOWNLOADER_MIDDLEWARES for the proxy to be applied
            ip = self.get_proxy()
            print("using proxy:", ip)
            request.meta['proxy'] = ip

        # fetch the proxy address and port from the provider's API
        def get_proxy(self):
            return requests.get("xxx").text
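
    None of the above takes effect until it is enabled in settings.py. A sketch of the relevant settings, assuming the default module layout of a project named test_spider (the module paths are my assumption; the delay matches the 3 seconds mentioned above, and the middleware must go in DOWNLOADER_MIDDLEWARES so that process_request is actually called):

    DOWNLOAD_DELAY = 3  # the 3-second delay used before switching to proxies

    ITEM_PIPELINES = {
        'test_spider.pipelines.TestSpiderPipeline': 300,
    }

    DOWNLOADER_MIDDLEWARES = {
        'test_spider.middlewares.TestSpiderSpiderMiddleware': 543,
    }

    With these in place, the crawl is started with scrapy crawl dbbook.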
    

    After crawling for two days and nights, the proxy subscription expired and the crawl stopped.



    Full code:
    https://github.com/pkxutao/douban_book
