scrapy crawler

Author: dongshangtong | 2019-06-06 09:00

    Scrapy official documentation

    Scrapy signals documentation

    Scrapy extensions documentation

    Scrapy stats collection documentation
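
    The three links above cover scrapy's signals, extensions and stats collection. As a small taste of how they fit together, here is a minimal sketch (not from the original post) of a custom extension that connects to spider signals and bumps a stat counter; the MYEXT_ENABLED setting name and the registration path are assumptions for illustration.

    from scrapy import signals
    from scrapy.exceptions import NotConfigured
    
    
    class SpiderStatsExtension(object):
        """Minimal extension: hooks spider signals and records a custom stat."""
    
        def __init__(self, stats):
            self.stats = stats
    
        @classmethod
        def from_crawler(cls, crawler):
            # Only enable the extension when MYEXT_ENABLED is set (assumed setting name)
            if not crawler.settings.getbool('MYEXT_ENABLED'):
                raise NotConfigured
            ext = cls(crawler.stats)
            # Signals: run callbacks when the spider opens and when an item is scraped
            crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
            crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
            return ext
    
        def spider_opened(self, spider):
            spider.logger.info("extension: spider %s opened", spider.name)
    
        def item_scraped(self, item, spider):
            # Stats collection: this counter shows up in the crawl's final stats dump
            self.stats.inc_value('myext/items_scraped')
    
    # Register it in settings.py, for example:
    # EXTENSIONS = {'myproject.extensions.SpiderStatsExtension': 500}
    # MYEXT_ENABLED = True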

    Installation:

    pip install scrapy
    

    Create a project

    scrapy startproject qiubai
    

    Create a new spider file inside the new project

     scrapy genspider example example.com
    
    # For example:
    scrapy genspider quibaiDemo  qiushibaike.com
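
    For orientation, the generated spider file looks roughly like the skeleton below (class name and start_urls vary slightly across scrapy versions); the parse method edited in the next step lives here.

    import scrapy
    
    
    class QuibaidemoSpider(scrapy.Spider):
        name = 'quibaiDemo'
        allowed_domains = ['qiushibaike.com']
        start_urls = ['https://www.qiushibaike.com/']
    
        def parse(self, response):
            pass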
    

    Then modify the following two settings in settings.py

    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
    
    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False
    
    

    Write the following inside the parse method of quibaiDemo

     def parse(self, response):
    
            div_list = response.xpath('//div[@id="content-left"]/div')
            all_data = []
    
            for div in div_list:
                # title = div.xpath('./div[1]/a[2]/h2/text() | ./div[1]/span[2]/h2/text()')[0].extract()
                title = div.xpath('./div[1]/a[2]/h2/text() | ./div[1]/span[2]/h2/text()').extract_first()
                # // *[ @ id = "qiushi_tag_121884912"] / a
                content = div.xpath('./a[1]/div/span/text()').extract_first()
    
                dic = {
                    'title': title,
                    'content': content
                }
    
                all_data.append(dic)
            return all_data
    

    Run the crawl command in the terminal

    #  For example
    scrapy crawl quibaiDemo
    
    #  Run without showing the log
    scrapy crawl quibaiDemo  --nolog
    
    # Export the output to a JSON file
    scrapy crawl quibaiDemo -o qiubai.json
    
    # Export the output to a CSV file
    scrapy crawl quibaiDemo -o qiubai.csv
    

    Using items.py and pipelines.py: the example below scrapes the Boss Zhipin (boss 直聘) job site.

    First, edit items.py

    class BossproItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        title = scrapy.Field()
        salary = scrapy.Field()
        company = scrapy.Field()
    

    Then edit pipelines.py

    class BossproPipeline(object):
        fp = None
    
        # Executed only once, when the spider starts
        def open_spider(self, spider):
            print('Spider started!!!')
            self.fp = open('./job.txt', 'w', encoding='utf-8')
    
        # Called once every time the spider yields an item
        def process_item(self, item, spider):
            self.fp.write(item['title'] + "\t" + item['salary'] + '\t' + item['company'] + '\n')
            return item
    
        def close_spider(self, spider):
            print('Spider finished!!!')
            self.fp.close()
    
    

    In the boss.py spider file (with BossproItem imported from bossPro.items at the top), the parse method:

        def parse(self, response):
            li_list = response.xpath('//div[@class="job-list"]/ul/li')
    
            for li in li_list:
                title = li.xpath('.//div[@class="info-primary"]/h3[@class="name"]/a/div/text()').extract_first()
                salary = li.xpath('.//div[@class="info-primary"]/h3[@class="name"]/a/span/text()').extract_first()
                company = li.xpath('.//div[@class="company-text"]/h3/a/text()').extract_first()
    
                # Instantiate an item object
                item = BossproItem()
                # Store the parsed values in the item object
                item['title'] = title
                item['salary'] = salary
                item['company'] = company
    
                # Yield the item to the pipeline for persistent storage
                yield item
    

    Remember to enable the following in settings.py, otherwise the pipeline has no effect.

    The number 300 is the priority: the smaller the number, the higher the priority.

    # Configure item pipelines
    # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
       'bossPro.pipelines.BossproPipeline': 300,  # 300 is the priority; the smaller the number, the higher the priority
    }
    

    A few commonly used settings

    # Log level
    LOG_LEVEL = 'ERROR'
    # Disable retries
    RETRY_ENABLED = False
    # Download timeout (seconds)
    DOWNLOAD_TIMEOUT = 3
    

    We can also customize settings for an individual spider with custom_settings

     custom_settings = {
            "COOKIES_ENABLED": True,
            "RETRY_ENABLED": False,
            # add more settings here...
        }
    

    If the spider inherits from CrawlSpider, it can follow pagination automatically; a sketch of the generated template is shown after the command.

    scrapy genspider -t crawl  crawlDemo  www.xxx.com
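
    The generated crawl template looks roughly like this; the allow pattern is a placeholder that you adapt to the site's pagination links.

    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    
    
    class CrawldemoSpider(CrawlSpider):
        name = 'crawlDemo'
        allowed_domains = ['www.xxx.com']
        start_urls = ['http://www.xxx.com/']
    
        # Each Rule extracts the links matched by its LinkExtractor; with follow=True
        # the spider keeps following matching pages, which gives the automatic pagination.
        rules = (
            Rule(LinkExtractor(allow=r'page/\d+'), callback='parse_item', follow=True),
        )
    
        def parse_item(self, response):
            item = {}
            # parse fields from the response here
            return item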
    

    By default, start_requests sends GET requests for the URLs in the start_urls list.
    Override the parent method as follows to make it send POST requests instead:

        def start_requests(self):
            data = {
                'kw':'dog'
            }
            for url in self.start_urls:
                yield scrapy.FormRequest(url=url,callback=self.parse,formdata=data)
    
    

    A downloader middleware that is often configured (random proxy and random User-Agent); register it in DOWNLOADER_MIDDLEWARES as shown after the code.

    import random
    
    
    class ProxyproDownloaderMiddleware(object):
        # Not all methods need to be defined. If a method is not defined,
        # scrapy acts as if the downloader middleware does not modify the
        # passed objects.
        # Intercept requests: the request argument is the intercepted request
        proxy_http = ['http://209.34.29.9:8181', 'http://209.34.29.9:8181', 'http://209.34.29.9:8181']
        proxy_https = ['https://119.59.84.58:8080', 'https://119.59.84.58:8080', 'https://119.59.84.58:8080']
        user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
            "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
            "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
            "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
            "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
            "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
            "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
            "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
            "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
            "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
        ]
        def process_request(self, request, spider):
            print('Downloader middleware:', request)
            
            if request.url.split(':')[0] == 'http':
                request.meta['proxy'] = random.choice(self.proxy_http)
            else:
                request.meta['proxy'] = random.choice(self.proxy_https)
                
                
            request.headers['User-Agent'] =  random.choice(self.user_agent_list)
            print(request.headers['User-Agent'])
            return None
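
    For the middleware to run, register it in settings.py; the proxyPro package name below is an assumption, use your own project's module path.

    DOWNLOADER_MIDDLEWARES = {
        'proxyPro.middlewares.ProxyproDownloaderMiddleware': 543,
    }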
    

    Using scrapy-redis for a distributed crawler

    Installation:

    pip install scrapy-redis
    

    Take crawling chouti.com (抽屉) as an example

    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from scrapy_redis.spiders import RedisCrawlSpider
    from choutiPro.items import ChoutiproItem
    
    class ChoutiSpider(RedisCrawlSpider):
        name = 'chouti'
        # allowed_domains = ['www.chouti.com']
        # start_urls = ['http://www.chouti.com/']
    
        redis_key = 'chouti'
        rules = (
            Rule(LinkExtractor(allow=r'/r/scoff/hot/\d+'), callback='parse_item', follow=True),
        )
    
        def parse_item(self, response):
            div_list = response.xpath('//div[@id="content-list"]/div')
            for div in div_list:
                item = ChoutiproItem()
                item['title'] = div.xpath('.//div[@class="part1"]/a/text()').extract_first()
                item['author'] = div.xpath('.//div[@class="part2"]/a[4]/b/text()').extract_first()
    
                yield item
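
    The ChoutiproItem imported above is not shown in the post; a minimal items.py definition matching the two fields used in parse_item would be:

    import scrapy
    
    
    class ChoutiproItem(scrapy.Item):
        title = scrapy.Field()
        author = scrapy.Field()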
    
    

    Add the following to settings.py

    
    # Add a dedupe filter class that stores request fingerprints in a Redis set, making request deduplication persistent
    DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
    # Use the scheduler provided by the scrapy-redis component
    SCHEDULER = "scrapy_redis.scheduler.Scheduler"
    # Whether the scheduler persists: when the crawl ends, should the request queue and the dedupe fingerprint set in Redis be kept? True means persist (do not clear the data); False means clear it
    SCHEDULER_PERSIST = True
    
    ITEM_PIPELINES = {
        'scrapy_redis.pipelines.RedisPipeline': 400
    }
    
    REDIS_HOST = '127.0.0.1'
    REDIS_PORT = 6379
    
    

    Log in to redis and push the start URL from the terminal (redis-cli)

    lpush chouti https://dig.chouti.com/r/scoff/hot/1
    

    You can also fetch a page directly from the terminal and analyze it there with scrapy shell

    scrapy shell https://dig.chouti.com/r/scoff/hot/1
    
    # Send the request with a custom User-Agent.
    scrapy shell -s  User-Agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36" https://dig.chouti.com/r/scoff/hot/1
    
    
    Extract only the first match:
    response.css(" .part1 a::text").extract_first()
    
    Extract all matches when there are several:
    response.css(" .part1 a::text").extract()
    
    Extract an attribute value:
    response.css(" .part2::attr(share-title)").extract_first()
    

    The User-Agent is a request header string that identifies the client, a bit like a browser's ID card.
    Rotating it frequently while crawling helps avoid triggering anti-scraping mechanisms.
    fake-useragent GitHub page
    Parameter reference for fake-useragent 0.1.11

    # This one has more GitHub stars.
    pip install fake-useragent
    
    # Another user-agent package
    pip install scrapy-fake-useragent
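
    A quick sanity check of fake-useragent outside scrapy; ua.random and ua.chrome each return a fresh user-agent string.

    from fake_useragent import UserAgent
    
    ua = UserAgent()
    print(ua.random)   # a random user-agent string
    print(ua.chrome)   # a random Chrome user-agent string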
    

    Pick one random proxy IP from the database.

    
    SELECT ip, port FROM proxy_ip ORDER BY RAND() LIMIT 1
    

    scrapy-proxies processes Scrapy requests with random proxies from a list, to avoid IP bans and improve crawl speed. scrapy-proxies GitHub page.

    pip install scrapy_proxies
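
    A typical scrapy_proxies configuration, roughly as in the project's README (verify the exact setting names against the current docs); proxies are read from a text file with one http://host:port entry per line.

    # Retry aggressively, since free proxies fail often
    RETRY_TIMES = 10
    RETRY_HTTP_CODES = [500, 503, 504, 400, 403, 404, 408]
    
    DOWNLOADER_MIDDLEWARES = {
        'scrapy.downloadermiddlewares.retry.RetryMiddleware': 90,
        'scrapy_proxies.RandomProxy': 100,
        'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
    }
    
    PROXY_LIST = '/path/to/proxy/list.txt'
    PROXY_MODE = 0  # 0 = pick a different random proxy for every request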
    
    

    scrapy-crawlera provides easy use of Crawlera with Scrapy.
    Documentation is available online and in the docs directory.

    pip install scrapy-crawlera
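
    Enabling it is mostly a settings change, roughly as below (the API key placeholder is yours to fill in; check the scrapy-crawlera docs for the current option names).

    DOWNLOADER_MIDDLEWARES = {
        'scrapy_crawlera.CrawleraMiddleware': 610,
    }
    CRAWLERA_ENABLED = True
    CRAWLERA_APIKEY = '<your Crawlera API key>'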
    

    We can model a downloader middleware of our own on this.

    To keep Chrome from opening a visible window

    pip install pyvirtualdisplay
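
    pyvirtualdisplay wraps Xvfb, so this works on Linux with xvfb installed; a minimal sketch of running Chrome inside an invisible virtual display:

    from pyvirtualdisplay import Display
    from selenium import webdriver
    
    # Start an invisible virtual display, then run Chrome inside it
    display = Display(visible=0, size=(800, 600))
    display.start()
    
    browser = webdriver.Chrome()
    browser.get('http://blog.jobbole.com/all-posts/')
    
    browser.quit()
    display.stop()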
    

    splinter is a browser automation library similar to selenium

    scrapy-redis GitHub page

    Advantages of a distributed crawler

    1. Make full use of the bandwidth of multiple machines to speed up crawling
    2. Make full use of the IPs of multiple machines to speed up crawling

    GitHub page for deploying a finished scrapy project with scrapyd
    scrapyd official documentation

    Installation

    pip install scrapyd
    
    pip install scrapyd-client
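
    A rough deployment flow, assuming the project from the examples above is called ArticleSpider and contains the jobbole spider. First point the [deploy] section of scrapy.cfg at a running scrapyd (url = http://localhost:6800/, project = ArticleSpider), then:

    # start the scrapyd server (in its own terminal)
    scrapyd
    
    # package and upload the project described by scrapy.cfg
    scrapyd-deploy -p ArticleSpider
    
    # schedule a crawl through scrapyd's JSON API
    curl http://localhost:6800/schedule.json -d project=ArticleSpider -d spider=jobbole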
    

    A custom user-agent middleware based on fake-useragent;
    register it in the settings as shown further below.

    from fake_useragent import UserAgent
    
    
    class RandomUserAgentMiddlware(object):
        # Randomly rotate the user-agent
        def __init__(self, crawler):
            super(RandomUserAgentMiddlware, self).__init__()
            self.ua = UserAgent()
            self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")
    
        @classmethod
        def from_crawler(cls, crawler):
            return cls(crawler)
    
        def process_request(self, request, spider):
            def get_ua():
                return getattr(self.ua, self.ua_type)
    
            request.headers.setdefault('User-Agent', get_ua())
    

    Give it a larger priority number than the default user-agent middleware and disable the built-in UserAgentMiddleware (set it to None) so the random user-agent takes effect

    DOWNLOADER_MIDDLEWARES = {
         'ArticleSpider.middlewares.JSPageMiddleware': 1,
         'ArticleSpider.middlewares.RandomUserAgentMiddlware': 543,
          'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
     }
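
    Because the middleware reads RANDOM_UA_TYPE from the settings, you can also pin which kind of user-agent it picks:

    # attribute of fake_useragent's UserAgent to use: "random", "chrome", "firefox", ...
    RANDOM_UA_TYPE = "random"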
    

    First, crawl Xici (西刺) for free proxy IPs

    import requests
    from scrapy.selector import Selector
    import MySQLdb
    
    conn = MySQLdb.connect(host="127.0.0.1", user="root", passwd="root", db="article_spider", charset="utf8")
    cursor = conn.cursor()
    
    
    def crawl_ips():
        # Crawl Xici's free ip proxies
        headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0"}
        for i in range(1, 1568):  # list pages start at /nn/1
            re = requests.get("http://www.xicidaili.com/nn/{0}".format(i), headers=headers)
    
            selector = Selector(text=re.text)
            all_trs = selector.css("#ip_list tr")
    
    
            ip_list = []
            for tr in all_trs[1:]:
                speed = 0.0
                speed_str = tr.css(".bar::attr(title)").extract()[0]
                if speed_str:
                    speed = float(speed_str.split("秒")[0])
                all_texts = tr.css("td::text").extract()
    
                ip = all_texts[0]
                port = all_texts[1]
                proxy_type = all_texts[5]
    
                ip_list.append((ip, port, proxy_type, speed))
    
            for ip_info in ip_list:
                cursor.execute(
                    "insert proxy_ip(ip, port, speed, proxy_type) VALUES('{0}', '{1}', {2}, 'HTTP')".format(
                        ip_info[0], ip_info[1], ip_info[3]
                    )
                )
    
                conn.commit()
    
    
    class GetIP(object):
        def delete_ip(self, ip):
            # Delete an invalid ip from the database
            delete_sql = """
                delete from proxy_ip where ip='{0}'
            """.format(ip)
            cursor.execute(delete_sql)
            conn.commit()
            return True
    
        def judge_ip(self, ip, port):
            # Check whether the ip is usable
            http_url = "http://www.baidu.com"
            proxy_url = "http://{0}:{1}".format(ip, port)
            try:
                proxy_dict = {
                    "http":proxy_url,
                }
                response = requests.get(http_url, proxies=proxy_dict)
            except Exception as e:
                print ("invalid ip and port")
                self.delete_ip(ip)
                return False
            else:
                code = response.status_code
                if code >= 200 and code < 300:
                    print ("effective ip")
                    return True
                else:
                    print  ("invalid ip and port")
                    self.delete_ip(ip)
                    return False
    
    
        def get_random_ip(self):
            # Get a random usable ip from the database
            random_sql = """
                  SELECT ip, port FROM proxy_ip
                ORDER BY RAND()
                LIMIT 1
                """
            result = cursor.execute(random_sql)
            for ip_info in cursor.fetchall():
                ip = ip_info[0]
                port = ip_info[1]
    
                judge_re = self.judge_ip(ip, port)
                if judge_re:
                    return "http://{0}:{1}".format(ip, port)
                else:
                    return self.get_random_ip()
    

    The middleware RandomProxyMiddleware sets a proxy IP dynamically (its registration in the settings is shown after the code).

    class RandomProxyMiddleware(object):
        # Dynamically set an ip proxy
        def process_request(self, request, spider):
            get_ip = GetIP()
            request.meta["proxy"] = get_ip.get_random_ip()
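
    As with the other middlewares, it only takes effect once registered in settings.py; the priority value here is just an example.

    DOWNLOADER_MIDDLEWARES = {
        'ArticleSpider.middlewares.RandomProxyMiddleware': 605,
    }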
    

    Integrating selenium into scrapy

    First, initialize selenium in the spider file

    import scrapy
    from selenium import webdriver
    from scrapy.xlib.pydispatch import dispatcher
    from scrapy import signals
    
    class JobboleSpider(scrapy.Spider):
        name = "jobbole"
        allowed_domains = ["blog.jobbole.com"]
        start_urls = ['http://blog.jobbole.com/all-posts/']
    
    
        def __init__(self):
            self.browser = webdriver.Chrome(executable_path="D:/Temp/chromedriver.exe")
            super(JobboleSpider, self).__init__()
            
            # Use scrapy signals: when the spider_closed signal is sent, shut down selenium.
            dispatcher.connect(self.spider_closed, signals.spider_closed)
    
        def spider_closed(self, spider):
            # Close chrome when the spider exits
            print ("spider closed")
            self.browser.quit()
    

    Define a custom JSPageMiddleware in middlewares.py.
    Remember to set 'ArticleSpider.middlewares.JSPageMiddleware': 1 in the settings file;
    with the smallest priority number it is called first.

    from scrapy.http import HtmlResponse
    class JSPageMiddleware(object):
    
        # Request dynamic pages through chrome
        def process_request(self, request, spider):
            if spider.name == "jobbole":
                # browser = webdriver.Chrome(executable_path="D:/Temp/chromedriver.exe")
                spider.browser.get(request.url)
                import time
                time.sleep(3)
                print("Visiting: {0}".format(request.url))
    
                return HtmlResponse(url=spider.browser.current_url, body=spider.browser.page_source, encoding="utf-8", request=request)
    
    
