
Scrapy Crawler in Practice: Trucks

Author: 诺之林 | Published: 2020-01-15 11:56
    pipenv --version
    # pipenv, version 2018.10.13
    
    mkdir truck && cd truck
    
    vim Pipfile
    
    [[source]]
    url = "https://mirrors.aliyun.com/pypi/simple"
    verify_ssl = true
    name = "pypi"
    
    [requires]
    python_version = "3.7"
    
    [packages]
    scrapy = "*"
    
    [dev-packages]
    pylint = "*"
    "autopep8" = "*"
    
    # gi is assumed to be the gitignore.io helper function; this appends a standard Python .gitignore
    gi python >> .gitignore
    
    pipenv install
    
    pipenv run scrapy version
    # Scrapy 1.8.0
    
    # startproject nests the project inside another truck/ directory; move its
    # contents up so scrapy.cfg sits at the repository root
    pipenv run scrapy startproject truck
    mv truck temp && mv temp/* . && rm -rf temp
    
    pipenv run scrapy genspider che che.com
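
For reference, scrapy genspider che che.com drops a skeleton spider into truck/spiders/che.py. With the default basic template it looks roughly like this (the exact contents can differ slightly between Scrapy versions):

    # -*- coding: utf-8 -*-
    import scrapy


    class CheSpider(scrapy.Spider):
        name = 'che'
        allowed_domains = ['che.com']
        start_urls = ['http://che.com/']

        def parse(self, response):
            pass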
    
    vim truck/spiders/che.py
    
    # -*- coding: utf-8 -*-
    
    import scrapy
    
    
    class CheSpider(scrapy.Spider):
        name = 'che'
        count = 0
        # Placeholders: replace '*' with the real site domain and listing URL prefix before running
        allowed_domains = ['*']
        base_url = "*"
    
        def start_requests(self):
            urls = [
                '*',  # tractor trucks (牵引车)
                '*',  # cargo trucks (载货车)
                '*',  # dump trucks (自卸车)
            ]
            for url in urls:
                yield scrapy.Request(url=url, callback=self.parse)
    
        def parse(self, response):
            # Wrap the decoded page text in a Selector (Selector expects unicode,
            # so use response.text rather than the raw bytes in response.body)
            body = scrapy.Selector(text=response.text)
            next_page = body.xpath('//a[contains(text(),"下一页")]/@href').extract()
            categories = body.xpath(
                '//div[@class="caption"]/h2/a/text()').extract()
            refs = body.xpath('//a[contains(text(),"配置")]/@href').extract()
            # Each spec ("配置") link lines up with a category title by index;
            # pass the category along via meta so detail pages can label themselves
            for i, ref in enumerate(refs):
                yield scrapy.Request(url=(self.base_url + ref.strip()),
                                     callback=self.parse_category,
                                     meta={'category': categories[i]})
            # Keep following the "下一页" (next page) link until pagination ends
            if next_page:
                yield scrapy.Request(url=(self.base_url + next_page[0]), callback=self.parse)
    
        def parse_category(self, response):
            body = scrapy.Selector(text=response.text)
            # Links to each model's detail page under the current category
            refs = body.xpath('//div[@class="title-bar"]/h5/a/@href').extract()
            for ref in refs:
                yield scrapy.Request(url=(self.base_url + ref.strip()),
                                     callback=self.parse_detail,
                                     meta={'category': response.meta.get('category')})
    
        def parse_detail(self, response):
            body = scrapy.Selector(text=response.text)
            name = body.xpath('//h1[@class="conttan_a_l"]/a/text()').extract()
            params = body.xpath(
                '//div[@class="sppic"][2]//td/div/text()').extract()
            # The spec table flattens to [label, value, label, value, ...];
            # pair each label (stripped of its trailing ':') with the value after it
            info = {}
            for i in range(0, len(params), 2):
                info[params[i].strip(':')] = params[i + 1]
            print(response.meta.get('category'))
            print(info)
            print(response.url)
            print(name[0])
            self.count += 1
            print(self.count)
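
The pairing logic in parse_detail is easiest to see on a toy input. The following is a minimal, self-contained sketch that runs the same XPath and label/value pairing against a made-up HTML fragment (the markup and values below are hypothetical; only the structure mirrors the spec table):

    from scrapy import Selector

    # Hypothetical fragment shaped like the detail page: the second div.sppic
    # holds a table whose cells alternate between labels and values
    html = '''
    <div class="sppic"></div>
    <div class="sppic">
      <table><tr>
        <td><div>驱动形式:</div></td><td><div>6x4</div></td>
        <td><div>总质量:</div></td><td><div>25000kg</div></td>
      </tr></table>
    </div>
    '''

    params = Selector(text=html).xpath('//div[@class="sppic"][2]//td/div/text()').extract()
    info = {params[i].strip(':'): params[i + 1] for i in range(0, len(params), 2)}
    print(info)  # {'驱动形式': '6x4', '总质量': '25000kg'}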
    
    pipenv run scrapy crawl che
    
    vim README.md
    
    pipenv install
    
    pipenv run scrapy crawl che
    
