
Scraping in Practice: Crawling Simi Comic Images with Scrapy

Author: 后山小鲨鱼 | Published 2020-04-07 17:49

    The comics on this site are tiered: some chapters can only be read with a paid membership, since this is restricted-rating material. I bought a membership and started crawling.
    What sets this spider apart from my earlier ones is that every request has to carry cookies; without them the member-only chapters cannot be fetched.
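    As a minimal sketch of that pattern (the cookie name PHPSESSID matches what the full spider below uses; the value is a placeholder you would copy from a logged-in browser session):

    import scrapy

    class CookieDemoSpider(scrapy.Spider):
        name = 'cookie_demo'

        def start_requests(self):
            # Session cookie copied from a logged-in browser (placeholder value)
            cookies = {'PHPSESSID': 'your-session-id-here'}
            yield scrapy.Request('http://simicomic.com/booklist/',
                                 cookies=cookies, callback=self.parse)

        def parse(self, response):
            self.logger.info('Fetched %s with member cookies', response.url)

    The full spider is below.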
    getsimicomic.py

    # -*- coding: utf-8 -*-
    import scrapy
    from simicomic.items import SimicomicItem
    
    class GetsimicomicSpider(scrapy.Spider):
        name = 'getsimicomic'
        allowed_domains = ['simicomic.com']
        start_urls = ['http://simicomic.com/booklist/']
    
        def parse(self, response):
            # Book titles, detail-page links and the chapter label shown on the list page
            bookNames = response.xpath('//ul[@class="manga-list-2"]/li/p[@class="manga-list-2-title"]/a/text()').extract()
            bookHrefs = response.xpath('//ul[@class="manga-list-2"]/li/p[@class="manga-list-2-title"]/a/@href').extract()
            bookChapters = response.xpath('//ul[@class="manga-list-2"]/li/p[@class="manga-list-2-tip"]/a/text()').extract()[0]
    
            for i, bookHref in enumerate(bookHrefs):
                bookHref = 'http://simicomic.com' + bookHref
                bookName = bookNames[i]
                # Browser headers
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
                    'Referer': 'http://simicomic.com/booklist'
                }
                # Cookies copied from a logged-in browser session (needed for member content)
                cookies = {
                    'book_referer': 'http%3A%2F%2Fsimicomic.com%2Fbooklist',
                    'PHPSESSID': 'ti662eu77k3t781gpo5vos67jj',
                    'nav_switch': 'booklist',
                }
                yield scrapy.Request(url=bookHref,
                                     callback=lambda response, bookName=bookName, bookChapters=bookChapters: self.parse2(response, bookName, bookChapters),
                                     dont_filter=True, headers=headers, cookies=cookies)
    
            # nowPage = int(response.xpath('//ul[@class = "pagination"]/li[@class="active"]/span/text()').extract()[0])
            # if nowPage <= 9:
            #     nextPage = nowPage + 1
            #     nextPageUrl = 'http://simicomic.com/booklist/?tag=%E5%85%A8%E9%83%A8&area=-1&end=-1&page='+str(nextPage)
            #     yield scrapy.Request(url=nextPageUrl, callback=self.parse)
    
        def parse2(self, response, bookName, bookChapters):
            # Chapter links, author, description and per-chapter titles on the detail page
            chapters = response.xpath('//ul[@id="detail-list-select"]/li/a/@href').extract()
            author = response.xpath('//p[@class="detail-main-info-author"]/a/text()').extract()[0]
            describe = response.xpath('//p[@class="detail-main-info-author"]/text()').extract()[2]
            pages = response.xpath('//ul[@id="detail-list-select"]/li/a/@title').extract()

            for i, chapter in enumerate(chapters):
                nowPage = pages[i]
                chapterHref = 'http://simicomic.com' + chapter
                # Browser headers
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
                    'Referer': 'http://simicomic.com'
                }
                # Cookies copied from a logged-in browser session (needed for member content)
                cookies = {
                    'PHPSESSID': 'ti662eu77k3t781gpo5vos67jj',
                    'nav_switch': 'booklist',
                }
                yield scrapy.Request(url=chapterHref,
                                     callback=lambda response, bookName=bookName, bookChapters=bookChapters, author=author, describe=describe, nowPage=nowPage: self.parse3(response, bookName, bookChapters, author, describe, nowPage),
                                     dont_filter=True, headers=headers, cookies=cookies)
    
        def parse3(self, response, bookName, bookChapters, author, describe, nowPage):
            # The pages lazy-load images, so the real URL sits in the data-original attribute
            img_urls = response.xpath('//div[@id="cp_img"]/div/img/@data-original').extract()
            for img_url in img_urls:
                item = SimicomicItem()
                item["bookName"] = bookName
                item["author"] = author
                item["myDescribe"] = describe
                item["bookChapters"] = bookChapters
                item["img_url"] = img_url
                item["nowPage"] = nowPage
                yield item
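
    A note on the callbacks: the lambdas above rely on default-argument binding to freeze each book's values at the time the request is created. On Scrapy 1.7 and later the same data can be passed more idiomatically with cb_kwargs; inside the loop in parse(), the yield could instead read (a sketch, not something the code above depends on):

    yield scrapy.Request(
        url=bookHref,
        callback=self.parse2,
        cb_kwargs={'bookName': bookName, 'bookChapters': bookChapters},
        dont_filter=True, headers=headers, cookies=cookies,
    )

    parse2 keeps the same signature, since cb_kwargs entries arrive as keyword arguments.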
    
    

    items.py

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://docs.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    class SimicomicItem(scrapy.Item):
        # table is the MySQL table name used by MysqlPipeline; a plain class attribute, not a Field
        table = 'simicomic'
        id = scrapy.Field()
        bookName = scrapy.Field()
        author = scrapy.Field()
        myDescribe = scrapy.Field()
        bookChapters = scrapy.Field()
        img_url = scrapy.Field()
        nowPage = scrapy.Field()
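
    The table attribute above names the MySQL table that the pipeline in pipelines.py inserts into. A sketch of creating a matching table with pymysql, using the connection values from settings.py; the column types are assumptions and can be adjusted:

    import pymysql

    # Column names mirror the item fields; the types are assumptions.
    ddl = '''
    CREATE TABLE IF NOT EXISTS simicomic (
        id INT AUTO_INCREMENT PRIMARY KEY,
        bookName VARCHAR(255),
        author VARCHAR(255),
        myDescribe TEXT,
        bookChapters VARCHAR(255),
        img_url VARCHAR(512),
        nowPage VARCHAR(64)
    )
    '''

    db = pymysql.connect(host='localhost', user='root', password='fuhong888',
                         database='scrapy_db', charset='utf8', port=3306)
    with db.cursor() as cursor:
        cursor.execute(ddl)
    db.commit()
    db.close()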
    
    

    pipelines.py

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    from scrapy.pipelines.images import ImagesPipeline
    from scrapy.exceptions import DropItem
    from scrapy import Request
    from scrapy.utils.project import get_project_settings
    import time
    import random
    import os
    import shutil
    import pymysql
    
    class SimicomicPipeline(object):
        def process_item(self, item, spider):
            return item
    
    
    # Image download pipeline
    class ImagePipeline(ImagesPipeline):
        # Read the storage path configured in settings.py
        IMAGES_STORE = get_project_settings().get("IMAGES_STORE")
    
    
        def item_completed(self, results, item, info):
            image_paths = [x['path'] for ok, x in results if ok]
            if not image_paths:
                raise DropItem('Image Download Failed')

            # Build a per-book / per-chapter directory for the image
            img_path = "%s/%s/%s" % (self.IMAGES_STORE, item['bookName'], '第' + item['nowPage'] + '章')
            # Create the directory if it does not exist yet
            if not os.path.exists(img_path):
                os.makedirs(img_path)

            # Move the file from the default download path to the target directory
            shutil.move(self.IMAGES_STORE + "/" + image_paths[0],
                        img_path + "/" + item["img_url"].split('/')[-1])
            item['img_url'] = img_path + "/" + item["img_url"].split('/')[-1]
            return item
    
    
    
        def get_media_requests(self, item, info):
            # Browser headers
            headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
                'Referer': 'http://simicomic.com'
            }
            # Cookies copied from a logged-in browser session (needed for member content)
            cookies = {
                'PHPSESSID': 'ti662eu77k3t781gpo5vos67jj',
                'nav_switch': 'booklist',
            }
            yield Request(item['img_url'], headers=headers, cookies=cookies)
    
    # MySQL pipeline
    class MysqlPipeline():
        def __init__(self,host,database,user,password,port):
            self.host = host
            self.database = database
            self.user = user
            self.password = password
            self.port = port
    
        @classmethod
        def from_crawler(cls,crawler):
            return cls(
                host=crawler.settings.get('MYSQL_HOST'),
                database = crawler.settings.get('MYSQL_DATABASE'),
                user = crawler.settings.get('MYSQL_USER'),
                password = crawler.settings.get('MYSQL_PASSWORD'),
                port = crawler.settings.get('MYSQL_PORT'),
            )
    
        def open_spider(self, spider):
            self.db = pymysql.connect(host=self.host, user=self.user, password=self.password,
                                      database=self.database, charset='utf8', port=self.port)
            self.cursor = self.db.cursor()
    
        def close_spider(self,spider):
            self.db.close()
    
        def process_item(self, item, spider):
            # Build an INSERT statement dynamically from whatever fields the item carries
            data = dict(item)
            keys = ','.join(data.keys())
            values = ','.join(['%s'] * len(data))
            # selectSql = 'select * from simicomic where img_url like (%s)' % (item["img_url"].split('/')[-1])
            # print(selectSql)
            # hasUrl = self.cursor.execute(selectSql)
            # print("hasUrl==>"+str(hasUrl))
            sql = 'insert into %s (%s) values (%s)' % (item.table,keys,values)
            self.cursor.execute(sql,tuple(data.values()))
            self.db.commit()
            return item
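
    Note that Scrapy's ImagesPipeline depends on Pillow, which has to be installed separately (pip install Pillow).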
    

    settings.py

    # -*- coding: utf-8 -*-
    
    # Scrapy settings for simicomic project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://docs.scrapy.org/en/latest/topics/settings.html
    #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    
    BOT_NAME = 'simicomic'
    
    SPIDER_MODULES = ['simicomic.spiders']
    NEWSPIDER_MODULE = 'simicomic.spiders'
    
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    #USER_AGENT = 'simicomic (+http://www.yourdomain.com)'
    
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False
    
    DEFAULT_REQUEST_HEADERS = {
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
    }
    
    
    # Image storage directory
    IMAGES_STORE = './images'
    # Delay between requests (seconds)
    DOWNLOAD_DELAY = 2
    ITEM_PIPELINES = {
        'simicomic.pipelines.ImagePipeline':300,
        'simicomic.pipelines.MysqlPipeline': 302,
    }
    
    
    
    # MySQL configuration
    MYSQL_HOST = 'localhost'
    MYSQL_DATABASE = 'scrapy_db'
    MYSQL_PORT = 3306
    MYSQL_USER = 'root'
    MYSQL_PASSWORD = 'fuhong888'
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    #DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16
    
    # Keep cookies enabled (the default); the member session depends on them
    COOKIES_ENABLED = True
    
    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    #DEFAULT_REQUEST_HEADERS = {
    #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #   'Accept-Language': 'en',
    #}
    
    # Enable or disable spider middlewares
    # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'simicomic.middlewares.SimicomicSpiderMiddleware': 543,
    #}
    
    # Enable or disable downloader middlewares
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    #DOWNLOADER_MIDDLEWARES = {
    #    'simicomic.middlewares.SimicomicDownloaderMiddleware': 543,
    #}
    
    # Enable or disable extensions
    # See https://docs.scrapy.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}
    
    # Configure item pipelines
    # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    #ITEM_PIPELINES = {
    #    'simicomic.pipelines.SimicomicPipeline': 300,
    #}
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
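
    With the pipelines registered and the MySQL connection settings filled in, the crawl can be started from the project root with: scrapy crawl getsimicomic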
    
    

    Result:

