Scraper practice: a small example of the crawl template

Author: BlueCat2016 | Published 2017-01-02 23:57

    Crawl target: the content and URL of every article on Qiushibaike (qiushibaike.com).
    Straight to the code:

    items.py

    import scrapy


    # Item for automatically scraped Qiushibaike posts
    class QSBKAutoItem(scrapy.Item):
        content = scrapy.Field()  # text of a post
        link = scrapy.Field()     # URL of the post's page
    
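    A Scrapy Item behaves like a dict with a fixed schema: only fields declared with scrapy.Field() can be assigned, and setting any other key raises a KeyError. A quick illustration (the "title" field is a deliberate mistake, not part of the item above):

    item = QSBKAutoItem()
    item["content"] = "some text"  # fine: "content" is declared
    item["title"] = "oops"         # KeyError: QSBKAutoItem does not support field: title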

    The spider file:

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from testscrapy01.items import QSBKAutoItem
    
    
    class QsbkautoSpider(CrawlSpider):
        name = 'qsbkauto'
        allowed_domains = ['qiushibaike.com']
        start_urls = ['http://qiushibaike.com/']
    
        rules = (
            # Link-extraction rule: the r prefix marks a raw string, so nothing in the
            # pattern is escaped; any URL containing 'article' is extracted.
            # follow=True: after a matched page is crawled, keep extracting links from it
            # and continue crawling.
            Rule(LinkExtractor(allow=r'article'), callback='parse_item', follow=True),
        )
    
        def parse_item(self, response):
            item = QSBKAutoItem()
            # Take the first match for the post body and the page's canonical URL;
            # note that extract()[0] raises IndexError if the XPath matches nothing
            item["content"] = response.xpath("//div[@class='content']/text()").extract()[0]
            item["link"] = response.xpath("//link[@rel='canonical']/@href").extract()[0]
            print("Content:", item["content"])
            print("Link:", item["link"])
            print("*************************************************************")
            yield item
    
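    To try the spider, run it from the project root with Scrapy's command-line tool (assuming the project is named testscrapy01, as the import path suggests); -o is Scrapy's standard flag for exporting the yielded items to a file:

    scrapy crawl qsbkauto -o qsbk.json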

    An improved version of the code above:

    # coding=utf-8
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from testscrapy01.items import QSBKAutoItem
    # Python 2 only: force the default str encoding to UTF-8 so that mixing
    # byte strings and unicode below does not raise UnicodeDecodeError
    import sys
    reload(sys)
    sys.setdefaultencoding('utf-8')
    
    class QsbkautoSpider(CrawlSpider):
        name = 'qsbkauto1'
        allowed_domains = ['qiushibaike.com']
        start_urls = ['http://www.qiushibaike.com/']
    
        rules = (
            # Pagination pages: no callback, they are crawled only to harvest links
            Rule(LinkExtractor(allow=r'page/\d*/?s=\d*'), follow=True),
            # Article pages: parse them and keep following links from them
            Rule(LinkExtractor(allow=r'article/.*'), callback='parse_item', follow=True),
        )
    
        def parse_item(self, response):
            item = QSBKAutoItem()
            item["content"] = response.xpath("//div[@class='content']/text()").extract()[0]
            # Improvement: take the URL straight from the response instead of
            # parsing the canonical <link> tag out of the page
            item["link"] = response.url
            content = "Content: " + item["content"]
            print(content.decode("utf-8"))
            link = "Link: " + item["link"]
            print(link.decode("utf-8"))
            print("*************************************************************")
            yield item
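    One caveat: the reload(sys) / sys.setdefaultencoding trick only exists on Python 2. On Python 3 all strings are Unicode, so both the hack and the .decode() calls can simply be dropped; a minimal Python 3 sketch of the same callback (the rest of the spider unchanged):

    def parse_item(self, response):
        item = QSBKAutoItem()
        # extract_first() returns None instead of raising IndexError on a miss
        item["content"] = response.xpath("//div[@class='content']/text()").extract_first()
        item["link"] = response.url
        print("Content:", item["content"])
        print("Link:", item["link"])
        yield item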
    
    
    
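    Independent of Python version, qiushibaike.com may reject Scrapy's default user agent, so a couple of settings are often needed before the spider returns anything. A sketch of the relevant settings.py fragment (the values are illustrative assumptions, not from the original post):

    # settings.py (fragment)
    ROBOTSTXT_OBEY = False   # the site's robots.txt may disallow crawling
    USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'  # browser-like UA
    DOWNLOAD_DELAY = 1       # be polite: at most one request per second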
