Scrapy pagination


Author: 楚糖的糖 | Published 2018-11-08 12:11

    Reference link

    Basic pagination, approach 1

    # coding: utf-8
    import scrapy
    from freebuf2.items import Freebuf2Item
    import time
    from scrapy.crawler import CrawlerProcess

    class Freebuf2Spider(scrapy.Spider):
        name = 'freebuf2'
        allowed_domains = []

        start_urls = ["http://www.freebuf.com/"]

        def parse(self, response):
            # Follow every article link on the list page.
            for link in response.xpath("//div[contains(@class, 'news_inner news-list')]/div/a/@href").extract():
                # If yield looks confusing here, review how generators work first.
                # Thinking of it as a coroutine (execution pauses and resumes) makes it easier to follow.
                yield scrapy.Request(link, callback=self.parse_next)

            # Find the "next page" link, i.e. the pagination link.
            next_url = response.xpath("//div[@class='news-more']/a/@href").extract()

            if next_url:
                yield scrapy.Request(next_url[0], callback=self.parse)

        def parse_next(self, response):
            item = Freebuf2Item()
            item['title'] = response.xpath("//h2/text()").extract()
            item['url'] = response.url
            item['date'] = response.xpath("//div[@class='property']/span[@class='time']/text()").extract()
            item['tags'] = response.xpath("//span[@class='tags']/a/text()").extract()

            yield item
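    The spider imports Freebuf2Item from freebuf2.items, but the item class itself is not shown in the post. A minimal sketch of what that items.py could look like, assuming only the four fields the spider fills in:

    # freebuf2/items.py -- assumed definition, not shown in the original post
    import scrapy

    class Freebuf2Item(scrapy.Item):
        title = scrapy.Field()
        url = scrapy.Field()
        date = scrapy.Field()
        tags = scrapy.Field()

    The unused CrawlerProcess import suggests the spider was also meant to be runnable from a plain Python script rather than only via "scrapy crawl freebuf2". A hypothetical runner, assuming the spider class is importable, would look roughly like this:

    # Hypothetical runner script (the original post never shows one).
    from scrapy.crawler import CrawlerProcess
    from freebuf2.spiders.freebuf2_spider import Freebuf2Spider  # assumed module path

    process = CrawlerProcess()       # export settings (feeds, etc.) could be passed here
    process.crawl(Freebuf2Spider)
    process.start()                  # blocks until the crawl finishes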
    

    Method 1: define a list of page URLs in start_urls and crawl each one

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy import Request
    from scrapy.selector import Selector
    from GZProject.items import *

    class GZSpider(scrapy.Spider):
        name = "gznw"                # Guizhou agricultural products price spider
        allowed_domains = ["gznw.gov.cn"]
        start_urls = [
         "http://www.gznw.gov.cn/priceInfo/getPriceInfoByAreaId.jx?areaid=22572&page=1",
         "http://www.gznw.gov.cn/priceInfo/getPriceInfoByAreaId.jx?areaid=22572&page=2",
         "http://www.gznw.gov.cn/priceInfo/getPriceInfoByAreaId.jx?areaid=22572&page=3"
        ]

        def parse(self, response):
            print('----------------Start------------------')
            print(response.url)

            # Each row of the price table is a <tr class="odd gradeX">.
            for sel in response.xpath('//tr[@class="odd gradeX"]'):
                item = GzprojectItem()
                num1 = sel.xpath('td[1]/text()').extract()[0]
                num2 = sel.xpath('td[2]/text()').extract()[0]
                num3 = sel.xpath('td[3]/text()').extract()[0]
                num4 = sel.xpath('td[4]/text()').extract()[0]
                num5 = sel.xpath('td[5]/text()').extract()[0]
                print(num1, num2, num3, num4, num5)

            print('\n')
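    The spider imports everything from GZProject.items, but the item class itself is not shown. A minimal sketch of what GzprojectItem could look like, assuming only the five fields these spiders use:

    # GZProject/items.py -- assumed definition, not shown in the original post
    import scrapy

    class GzprojectItem(scrapy.Item):
        num1 = scrapy.Field()
        num2 = scrapy.Field()
        num3 = scrapy.Field()
        num4 = scrapy.Field()
        num5 = scrapy.Field()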
    
    

    Method 2: build each page URL by string concatenation and request them in a loop

    import scrapy
    from scrapy import Request
    from scrapy.selector import Selector
    from GZProject.items import *

    class GZSpider(scrapy.Spider):
        name = "gznw"                # Guizhou agricultural products price spider
        allowed_domains = ["gznw.gov.cn"]
        start_urls = [
         "http://www.gznw.gov.cn/priceInfo/getPriceInfoByAreaId.jx?areaid=22572&page=1"
        ]

        def parse(self, response):
            print('----------------Start------------------')
            print(response.url)

            for sel in response.xpath('//tr[@class="odd gradeX"]'):
                item = GzprojectItem()
                num1 = sel.xpath('td[1]/text()').extract()[0]
                num2 = sel.xpath('td[2]/text()').extract()[0]
                num3 = sel.xpath('td[3]/text()').extract()[0]
                num4 = sel.xpath('td[4]/text()').extract()[0]
                num5 = sel.xpath('td[5]/text()').extract()[0]
                print(num1, num2, num3, num4, num5)
                item['num1'] = num1
                item['num2'] = num2
                item['num3'] = num3
                item['num4'] = num4
                item['num5'] = num5
                yield item
            print('\n')

            # Crawl the remaining pages by building their URLs directly.
            # Scrapy's default duplicate filter drops requests that have already
            # been scheduled, so running this loop from every page's callback is safe.
            i = 2
            while i <= 10:
                next_url = "http://www.gznw.gov.cn/priceInfo/getPriceInfoByAreaId.jx?areaid=22572&page=" + str(i)
                i = i + 1
                yield Request(next_url)
    ---------------------
    Author: Eastmount
    Source: CSDN
    Original: https://blog.csdn.net/eastmount/article/details/79307675
    

    Method 3: follow the "next page" hyperlink to crawl each page

    If a "next page" hyperlink exists, follow it and keep crawling; once the "next page" link is empty, stop. The core code is as follows:
    
    next_url = response.xpath('//a[@class="page-link next"]/@href').extract()
    # extract() returns a list, which is never None, so test for a non-empty list instead
    if next_url:
        next_url = 'http://www.gznw.gov.cn/priceInfo/getPriceInfoByAreaId.jx' + next_url[0]
        yield Request(next_url, callback=self.parse)
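    The snippet above concatenates the extracted href onto a fixed path, which assumes the link holds only the query-string part of the URL. A more robust variant, assuming Scrapy 1.4+ where response.follow is available, lets Scrapy resolve relative links itself:

    # Sketch only: response.follow resolves a relative href against response.url.
    next_page = response.xpath('//a[@class="page-link next"]/@href').extract_first()
    if next_page:
        yield response.follow(next_page, callback=self.parse)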
    
    

    Limiting the number of pages crawled

    # Note: for the counter to limit anything it must persist across calls to
    # parse(), e.g. as a class attribute `page_count = 0` on the spider, rather
    # than a local variable reset to 0 on every response.
    next_url = response.xpath('//a[@class="page-link next"]/@href').extract()
    if next_url and self.page_count < 20:
        self.page_count = self.page_count + 1
        next_url = 'http://www.gznw.gov.cn/priceInfo/getPriceInfoByAreaId.jx' + next_url[0]
        yield Request(next_url, callback=self.parse)
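    If the goal is simply to cap how many pages the whole crawl downloads, Scrapy's built-in CloseSpider extension can do this without a hand-written counter. A minimal sketch using the CLOSESPIDER_PAGECOUNT setting:

    # Sketch: ask the CloseSpider extension to stop after about 20 downloaded responses.
    class GZSpider(scrapy.Spider):
        name = "gznw"
        custom_settings = {
            'CLOSESPIDER_PAGECOUNT': 20,
        }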
    
    
