Crawling the Entire Lagou Site: CrawlSpider & ItemLoader

Author: 我叫钱小钱 | Published 2017-05-01 15:24

    1. Introduction

    I've been busy studying the fundamentals of mathematical statistics lately, so the May Day holiday is the perfect chance to knock out some crawler code.

    The code below demonstrates basic usage of CrawlSpider, a subclass of Scrapy's Spider, together with ItemLoader and an asynchronous Twisted-based MySQL insert pipeline.

    Since this post touches on a lot of topics, it is meant for reading and discussion only; it does not cover anti-crawling measures, regex-based cleaning, or other fundamentals. If anything is unclear, please search for the relevant keywords.

    OK, enough talk, let's get to the hands-on part.

    2. Environment and creating the crawl spider

    Environment

    win10 / python3.5 / pycharm


    Create the crawl spider

    >>cmd
    # Target site: lagou.com
    scrapy genspider -t crawl lagou www.lagou.com
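
    The command above generates a spider skeleton from Scrapy's crawl template. Roughly (the exact stub varies a little between Scrapy versions), the generated lagou.py looks like the sketch below; the placeholder rule and parse_item are then replaced with the real logic in the next section.

    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule

    class LagouSpider(CrawlSpider):
        name = 'lagou'
        allowed_domains = ['www.lagou.com']
        start_urls = ['http://www.lagou.com/']

        rules = (
            # Placeholder rule from the template; replace allow with real URL patterns
            Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
        )

        def parse_item(self, response):
            item = {}
            # Extract fields into the dict here
            return item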
    

    3. CrawlSpider Code

    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from article.items import LagouItemLoader, LagouItem
    from article.utils.common import hash_md5
    import datetime

    class LagouSpider(CrawlSpider):
        name = 'lagou'
        allowed_domains = ['www.lagou.com']
        start_urls = ['https://www.lagou.com/']
        rules = (
            Rule(LinkExtractor(allow=(r'zhaopin/.*',)), follow=True),
            Rule(LinkExtractor(allow=(r'gongsi/j\d\.html',)), follow=True),
            Rule(LinkExtractor(allow=(r'jobs/.*',),
                               restrict_css="div#s_position_list ul.item_con_list"),
                 callback='parse_item', follow=False),
        )
        # LinkExtractor
        # Purpose: extracts links from the response object; the extracted links are crawled next.
        # Usage: describe the links you want through the LinkExtractor arguments.
        # Main parameters:
        # allow: URLs matching the regular expression(s) are extracted; if empty, everything matches.
        #   Matched links are followed (with no callback, follow defaults to True).
        # deny: URLs matching this regular expression (or list of expressions) are never extracted.
        # allow_domains: only links in these domains are extracted.
        # deny_domains: links in these domains are never extracted.
        # restrict_xpaths / restrict_css: an XPath/CSS expression that filters links together with allow.
        # Heads up!!! When writing crawl rules, never use parse as the callback:
        # CrawlSpider implements its own logic in parse, and overriding it breaks the spider.

        def parse_item(self, response):
            item_loader = LagouItemLoader(item=LagouItem(), response=response)
            # In scrapy shell you need .extract() to see values; with an ItemLoader you do not.
            item_loader.add_css("title", "div.job-name::attr(title)")
            item_loader.add_value("url", response.url)
            item_loader.add_value("url_object_id", hash_md5(response.url))
            item_loader.add_css("salary", "span.salary::text")
            item_loader.add_xpath("job_city", ".//*[@class='job_request']/p/span[2]/text()")
            item_loader.add_xpath("work_years", ".//*[@class='job_request']/p/span[3]/text()")
            item_loader.add_xpath("degree_need", ".//*[@class='job_request']/p/span[4]/text()")
            item_loader.add_xpath("job_type", ".//*[@class='job_request']/p/span[5]/text()")
            item_loader.add_css("tags", "li.labels::text")
            item_loader.add_css("publish_time", "p.publish_time::text")
            item_loader.add_css("job_advantage", "dd.job-advantage p::text")
            item_loader.add_css("job_desc", "dd.job_bt div p::text")
            item_loader.add_css("work_addr", "div.work_addr")
            item_loader.add_css("company_name", "dl.job_company a img::attr(alt)")
            item_loader.add_css("company_url", "dl.job_company dt a::attr(href)")
            item_loader.add_value("crawl_time", datetime.datetime.now())
            # The ItemLoader only holds the scraping logic; data cleaning lives in items.py.
            lagou_item = item_loader.load_item()
            # A handy breakpoint spot: inspect the loaded values to see the scraped fields.
            return lagou_item
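
    The hash_md5 helper imported from article.utils.common is not shown in the post. Presumably it just turns the URL into a fixed-length MD5 digest to use as url_object_id; a minimal sketch under that assumption:

    import hashlib

    def hash_md5(text):
        # Hypothetical reconstruction: hex MD5 digest of a URL, used as a fixed-length unique id
        if isinstance(text, str):
            text = text.encode("utf-8")
        return hashlib.md5(text).hexdigest()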
    

    4. ItemLoader Code

    import re
    import scrapy
    from scrapy.loader import ItemLoader
    from scrapy.loader.processors import MapCompose, TakeFirst, Join
    from w3lib.html import remove_tags
    from article.settings import SQL_DATETIME_FORMAT, SQL_DATE_FORMAT
    import datetime

    def ends_filter(value):
        # Cleaning function for lagou.com fields
        if "查看地图" in value:
            # Work-address block: drop the "查看地图" ("view map") line and re-join the rest
            tp_list = value.split("\n")
            v_list = [v.strip() for v in tp_list if "查看地图" not in v]
            return " ".join(v_list).strip()
        elif "发布于拉勾网" in value:
            # Publish time: strip the trailing "发布于拉勾网" ("published on lagou.com")
            return value.replace("发布于拉勾网", "").strip()
        elif "/" in value:
            return value.replace("/", "").strip()
        else:
            return value.strip()

    class LagouItemLoader(ItemLoader):
        # ItemLoader returns lists by default, so override this class's default output processor
        default_output_processor = TakeFirst()

    class LagouItem(scrapy.Item):
        # Attach the cleaning functions above inside scrapy.Field(); a processor can take several functions
        title = scrapy.Field()
        url = scrapy.Field()
        url_object_id = scrapy.Field()
        salary = scrapy.Field(
            input_processor=MapCompose(ends_filter)
        )
        job_city = scrapy.Field(
            input_processor=MapCompose(ends_filter)
        )
        work_years = scrapy.Field(
            input_processor=MapCompose(ends_filter)
        )
        degree_need = scrapy.Field(
            input_processor=MapCompose(ends_filter)
        )
        job_type = scrapy.Field()
        tags = scrapy.Field(
            output_processor=Join(",")
        )
        publish_time = scrapy.Field(
            input_processor=MapCompose(ends_filter)
        )
        job_advantage = scrapy.Field(
            output_processor=Join("\n")
        )
        job_desc = scrapy.Field(
            output_processor=Join("\n")
        )
        work_addr = scrapy.Field(
            input_processor=MapCompose(remove_tags, ends_filter)
        )
        company_name = scrapy.Field()
        company_url = scrapy.Field()
        crawl_time = scrapy.Field()

        def insert_values(self):
            insert_sql = """
                INSERT INTO lagou_job(url_object_id, title, url, salary, job_city, work_years,
                degree_need, job_type, publish_time, tags, job_advantage, job_desc, work_addr,
                company_url, company_name, crawl_time)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                ON DUPLICATE KEY UPDATE salary=VALUES(salary), job_desc=VALUES(job_desc)
            """
            params = (
                self["url_object_id"], self["title"], self["url"], self["salary"], self["job_city"],
                self["work_years"], self["degree_need"], self["job_type"], self["publish_time"],
                self["tags"], self["job_advantage"], self["job_desc"], self["work_addr"],
                self["company_url"], self["company_name"],
                self["crawl_time"].strftime(SQL_DATETIME_FORMAT)
            )
            # Defining the insert statement on the item means the pipeline below
            # never needs to change when new spiders are added.
            return insert_sql, params
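
    SQL_DATETIME_FORMAT and SQL_DATE_FORMAT come from the project's settings.py and are not shown in the post; they are presumably ordinary strftime format strings, for example:

    # settings.py (assumed values, adjust to your table schema)
    SQL_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"
    SQL_DATE_FORMAT = "%Y-%m-%d"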
    

    5. TwistedPipeline Code

    import MySQLdb
    import MySQLdb.cursors
    from twisted.enterprise import adbapi

    class MysqlTwistedPipline(object):
        # Generic MySQL pipeline with asynchronous inserts
        def __init__(self, dbpool):
            self.dbpool = dbpool

        @classmethod
        def from_settings(cls, settings):
            # The DB parameters are configured in settings.py
            dbparms = dict(
                host=settings["MYSQL_HOST"],
                db=settings["MYSQL_DBNAME"],
                user=settings["MYSQL_USER"],
                passwd=settings["MYSQL_PASSWORD"],
                charset='utf8',
                cursorclass=MySQLdb.cursors.DictCursor,
                use_unicode=True,
            )
            dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
            return cls(dbpool)

        def process_item(self, item, spider):
            # Use twisted to turn the MySQL insert into an asynchronous call
            query = self.dbpool.runInteraction(self.do_insert, item)
            query.addErrback(self.handle_error, item, spider)  # handle exceptions
            return item

        def handle_error(self, failure, item, spider):
            # Handle exceptions raised by the asynchronous insert
            print(failure)

        def do_insert(self, cursor, item):
            # Perform the actual insert;
            # each item builds its own SQL statement, so one pipeline serves every spider
            insert_sql, params = item.insert_values()
            print(insert_sql, params)
            cursor.execute(insert_sql, params)
            return item
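
    For the pipeline to run, it still has to be enabled in settings.py together with the MYSQL_* values it reads. A minimal sketch, assuming the pipeline lives in article/pipelines.py and using placeholder credentials:

    # settings.py (assumed module path and example values)
    ITEM_PIPELINES = {
        'article.pipelines.MysqlTwistedPipline': 300,
    }
    MYSQL_HOST = "127.0.0.1"
    MYSQL_DBNAME = "article_spider"
    MYSQL_USER = "root"
    MYSQL_PASSWORD = "your_password"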
    

    Finally, a link to a CrawlSpider source-code walkthrough:
    http://www.lai18.com/content/471040.htm
