美文网首页
scrapy爬取伯乐在线文章

scrapy爬取伯乐在线文章

作者: nice生活 | 来源:发表于2018-09-02 17:23 被阅读0次

    爬取伯乐在线文章相对来说是比较简单的,因为网站没有什么反爬取的措施,整站爬取我们可以使用广度优先算法和深度优先算法,scrapy使用的是后进先出队列,基本可以看成是深度优先。但伯乐在线提供了全部文章的页面,我们直接在这个页面爬取即可。

    开发python版本:python3.5
    开发工具pycharm

    首先创建虚拟环境，然后进入虚拟环境，通过pip下载scrapy（虚拟环境中pip即对应环境内的python3）

    pip install scrapy
    

    通常我们只需执行上面即可,虽然scrapy有几个依赖库,但执行上面语句会自动下载的

    如果是win系统还需要下载pywin32

    pip install pywin32
    

    然后通过scrapy startproject blspider创建项目，再cd blspider，通过scrapy genspider jobbole blog.jobbole.com创建爬虫（注意genspider接收的是域名而非完整URL，爬虫名需与代码中的name一致），最后通过pycharm打开我们的项目。

    在编写前需要在setting配置我们的开发环境

    选择创建的虚拟环境,开始编写爬虫。
    爬取内容有首页图片,文章标题,发布时间,标签,内容,点赞数,评论数。
    items.py

    import re
    from datetime import datetime

    import scrapy
    from scrapy.loader import ItemLoader
    from scrapy.loader.processors import TakeFirst, MapCompose, Join
    
    # Tag input processor: drop the comment-count pseudo-tag that appears
    # alongside real tags in the article meta line.
    def tags_handle(value):
        """Return "" when *value* contains "评论" (a comment count), else *value*."""
        return "" if "评论" in value else value
    
    # Extract the first integer from a string such as "123 评论"; 0 when none found.
    def num_handle(value):
        """Return the first run of digits in *value* as an int, or 0 if absent.

        The non-greedy ``.*?`` prefix is essential: the original greedy ``.*``
        swallowed all but the last digit ("123" parsed as 3, not 123).
        """
        g = re.match(r".*?(\d+).*", value)
        return int(g.group(1)) if g else 0
    
    # Parse a "YYYY/MM/DD" date string; fall back to today's date on bad input.
    def date_handle(value):
        """Return a ``datetime.date`` parsed from "%Y/%m/%d", or today's date.

        Surrounding whitespace is stripped first, since scraped text usually
        carries it. Only parsing-related errors are caught (ValueError for a
        malformed date, TypeError/AttributeError for a non-string) so that
        unexpected failures still surface instead of being silently swallowed.
        """
        try:
            return datetime.strptime(value.strip(), "%Y/%m/%d").date()
        except (ValueError, TypeError, AttributeError):
            return datetime.now().date()
    
    # Identity processor: hand the value back untouched. Attached as an output
    # processor it overrides the loader-wide TakeFirst default.
    def return_value(value):
        """Pass-through processor; return *value* unchanged."""
        return value
    
    # Project-wide item loader.
    class MyItemLoader(ItemLoader):
        """ItemLoader whose fields collapse to the first extracted value by default."""
        default_output_processor = TakeFirst()
    
    
    class BlspiderItem(scrapy.Item):
        """Container for one scraped Jobbole post.

        Input/output processors attached to individual fields normalise the raw
        values extracted through MyItemLoader (date parsing, tag filtering,
        vote/comment count extraction).
        """
        # define the fields for your item here like:
        # name = scrapy.Field()
        title = scrapy.Field()
        date = scrapy.Field(
            # parse "YYYY/MM/DD"; date_handle falls back to today's date on failure
            input_processor=MapCompose(date_handle)
        )
        post_url = scrapy.Field()
        image_url = scrapy.Field(
            # identity OUTPUT processor overrides the loader's TakeFirst default,
            # keeping the value a list — the settings point ImagesPipeline's
            # IMAGES_URLS_FIELD at this field, and that pipeline iterates the URLs
            output_processor=MapCompose(return_value)
        )
        # populated later from ImagesPipeline download results (see pipelines.py)
        image_path = scrapy.Field()
        tags = scrapy.Field(
            # drop the "N 评论" pseudo-tag, then join the remaining tags as CSV
            input_processor=MapCompose(tags_handle),
            output_processor=Join(",")
        )
        content = scrapy.Field()
        favour = scrapy.Field(
            # extract the integer from the vote text (e.g. "8 赞" — see spider CSS)
            input_processor=MapCompose(num_handle)
        )
        comment = scrapy.Field(
            input_processor=MapCompose(num_handle)
        )
        # Build the parameterised INSERT for this item.
        def getinsertsql(self):
            """Return an ``(sql, params)`` pair for inserting this post.

            Uses INSERT IGNORE so re-crawled posts are skipped silently.
            image_url/image_path may be absent, hence ``.get()``; all other
            fields are expected to be set (KeyError surfaces otherwise and is
            reported by the pipeline's errback).
            """
            sql = """
                     insert ignore into blsposts values (%s,%s,%s,%s,%s,%s,%s,%s,%s)
                  """
            params = (self["title"], self["date"], self["post_url"],
                      self.get("image_url"), self.get("image_path"), self["tags"],
                      self["content"], self["favour"], self["comment"])
            return sql, params
    

    bole.py

    import scrapy
    from scrapy.http import Request
    from urllib.parse import urljoin
    from blspider.items import MyItemLoader, BlspiderItem
    
    class JobboleSpider(scrapy.Spider):
        """Crawl jobbole's all-posts listing and scrape every article."""
        name = 'jobbole'
        allowed_domains = ['blog.jobbole.com']
        start_urls = ['http://blog.jobbole.com/all-posts/']

        def parse(self, response):
            """Parse one listing page: request each post's detail page, follow pagination."""
            post_nodes = response.css("#archive .post-thumb a")
            for node in post_nodes:
                post_url = node.css("::attr(href)").extract_first()
                if not post_url:
                    # defensive: urljoin(base, None) returns the listing URL itself,
                    # which would re-queue the listing page as an article — skip instead
                    continue
                image_url = node.css("img::attr(src)").extract_first()
                # a post may have no thumbnail; without this guard urljoin would
                # silently yield the listing-page URL as the "image" URL
                meta = {"image_url": urljoin(response.url, image_url) if image_url else None}
                yield Request(url=urljoin(response.url, post_url), meta=meta,
                              callback=self.detail_parse)
            next_url = response.css("div.navigation a.next::attr(href)").extract_first()
            if next_url:
                yield Request(url=urljoin(response.url, next_url), callback=self.parse)

        def detail_parse(self, response):
            """Extract a single article into a BlspiderItem via the custom loader."""
            itemloader = MyItemLoader(item=BlspiderItem(), response=response)
            itemloader.add_css("title", ".entry-header h1::text")
            itemloader.add_css("date", ".entry-meta-hide-on-mobile::text")
            itemloader.add_value("post_url", response.url)
            image_url = response.meta.get("image_url")
            if image_url:
                # only record a thumbnail when the listing actually provided one
                itemloader.add_value("image_url", image_url)
            itemloader.add_css("tags", ".entry-meta-hide-on-mobile a::text")
            itemloader.add_css("content", ".entry")
            itemloader.add_css("favour", ".vote-post-up h10::text")
            itemloader.add_css("comment", ".post-adds a span::text")
            item = itemloader.load_item()
            if not item.get("favour"):
                # posts with zero votes render no matching element at all
                item["favour"] = 0
            yield item
    
    

    pipelines.py

    from twisted.enterprise import adbapi
    import pymysql
    
    class BlspiderPipeline(object):
        """Persist scraped posts into MySQL asynchronously via twisted's adbapi pool."""

        def __init__(self, pool):
            self.dbpool = pool

        @classmethod
        def from_settings(cls, settings):
            """Alternate constructor used by scrapy: build the connection pool from settings."""
            dbparams = dict(
                host=settings["MYSQL_HOST"],
                database=settings["MYSQL_DB"],
                user=settings["MYSQL_USER"],
                password=settings["MYSQL_PW"],
                charset='utf8',
                cursorclass=pymysql.cursors.DictCursor,
                use_unicode=True
            )
            pool = adbapi.ConnectionPool("pymysql", **dbparams)
            return cls(pool)

        def process_item(self, item, spider):
            """Normalise the image result, then schedule an async insert. Returns the item."""
            # ImagesPipeline stores results as a list of dicts with a "path" key;
            # collapse to a single path string, or None when nothing was downloaded.
            # .get() also guards the case where the field was never set at all —
            # item["image_path"] would raise KeyError for an unset scrapy field.
            results = item.get("image_path")
            item["image_path"] = results[0]["path"] if results else None
            query = self.dbpool.runInteraction(self.do_insert, item)
            query.addErrback(self.error_handle, item)
            return item

        def error_handle(self, error, item):
            # best-effort reporting; failed inserts are logged, not retried
            print(error)

        def do_insert(self, cursor, item):
            """Run the item's own INSERT inside the pool's transaction."""
            sql, params = item.getinsertsql()
            cursor.execute(sql, params)
    

    setting.py

    # Ignore robots.txt so the whole site can be crawled.
    ROBOTSTXT_OBEY = False
    # Scrapy runs pipelines in ascending priority order: download the images
    # first (priority 1) so the DB pipeline (priority 2) can read image_path.
    ITEM_PIPELINES = {
        'blspider.pipelines.BlspiderPipeline': 2,
       'scrapy.pipelines.images.ImagesPipeline': 1
    }
    
    # Item field holding the list of image URLs for ImagesPipeline to fetch.
    IMAGES_URLS_FIELD = "image_url"
    import os
    root = os.path.abspath(os.path.dirname(__file__))
    # Store downloaded images under <project>/images.
    IMAGES_STORE = os.path.join(root,"images")
    # Item field where ImagesPipeline writes its download results.
    IMAGES_RESULT_FIELD = "image_path"
    # MySQL connection parameters — fill in your own credentials.
    MYSQL_HOST = "localhost"
    MYSQL_DB = "..."
    MYSQL_USER = "..."
    MYSQL_PW = "..."
    

    相关文章

      网友评论

          本文标题:scrapy爬取伯乐在线文章

          本文链接:https://www.haomeiwen.com/subject/jrxmwftx.html