Python (77): Pipelines, Logging, and Full-Site Crawling


Author: Lonelyroots | Published 2022-03-17 22:30
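
    16_管道、日志与全站爬取/myspider02/myspider02/spiders/taoche.py (path not shown in the original; inferred from the standard Scrapy project layout):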


    import scrapy
    import logging
    from myspider02.items import Myspider02Item, TaocheParamenterConfig
    
    logger = logging.getLogger(__name__)
    
    
    class TaocheSpider(scrapy.Spider):
        name = 'taoche'
        allowed_domains = ['taoche.com']
        start_urls = ['https://changsha.taoche.com/bmw/']
    
        # URL template for the paginated listing pages
        url = 'https://changsha.taoche.com/bmw/?page=%d'
    
        count = 0
    
        def parse(self, response):
            # a[last()-1] grabs the second-to-last pager link, which carries the highest page number
            max_page = response.xpath('//div[@class="paging-box the-pages"]/div/a[last()-1]/text()').extract_first()
            # logger.error(max_page)
            for page in range(1, int(max_page) + 1):
                new_url = self.url % page
                # Manually request every page, handing each URL to the scheduler (as a Request object, not a bare string)
                """
                    How to hand a request to the scheduler:
                        yield scrapy.Request()
                    url: the request URL
                    callback: the function that processes this request's response
                    meta: data to pass along with the request
                        every request carries the meta dict {'page': page},
                        which is handed on to the response:
                        response.meta = meta
                        response.meta['page']
                """
                yield scrapy.Request(url=new_url, callback=self.parse_taoche, meta={'page': page})
    
        def parse_taoche(self, response):
            # logger.error(f'{response.meta["page"]}')
            # Each call to this method handles one listing page's list of cars
            car_list = response.xpath('//div[@id="container_base"]/ul/li')
            for car in car_list:
                # self.count += 1  # count is only used for testing
                # logger.error(self.count)
                CarFigure = car.xpath('./div[1]/div/a/img/@src').extract_first()
                Title = car.xpath('./div[2]/a/span/text()').extract_first()
                RegisterYear = car.xpath('./div[2]/p/i[1]/text()').extract_first()
                mileage = car.xpath('./div[2]/p/i[2]/text()').extract_first()
                city = car.xpath('./div[2]/p/i[3]/text()').extract_first().strip()
                selling_price = car.xpath('./div[2]/div[1]/i[1]/text()').extract_first()
                price = car.xpath('.//div[@class="price"]/i[2]/text()').extract_first()
    
                item = Myspider02Item()
                item['CarFigure'] = CarFigure
                item['Title'] = Title
                item['RegisterYear'] = RegisterYear
                item['mileage'] = mileage
                item['city'] = city
                item['selling_price'] = selling_price
                item['price'] = price
    
                # logger.error(item)
    
                # Get the URL of each car's detail page
                detail_url = car.xpath('./div[1]/div/a/@href').extract_first()
                yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})
    
        def parse_detail(self, response):
            # The dd text has the form "displacement/gearbox"
            attrs = response.xpath('/html/body/div[9]/div[1]/div[2]/div[4]/div/dl[3]/dd/text()').extract_first()
            displacement, gearbox = attrs.split('/')
    
            BrandModel = response.xpath('/html/body/div[9]/div[10]/div[2]/div[1]/ul/li[1]/span/a/text()').extract_first()
            SourceLocation = response.xpath('/html/body/div[9]/div[10]/div[2]/div[1]/ul/li[2]/span/text()').extract_first()
    
            taocheParamenterConfig = TaocheParamenterConfig()
            taocheParamenterConfig['displacement'] = displacement
            taocheParamenterConfig['gearbox'] = gearbox
            taocheParamenterConfig['BrandModel'] = BrandModel
            taocheParamenterConfig['SourceLocation'] = SourceLocation
    
            # Foreign-key-style association: nest the detail item inside the listing item
            item = response.meta['item']
            item['detail'] = taocheParamenterConfig
    
            # logger.error(item)
    
            yield item
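
    The spider above covers the whole site: parse reads the page count from the pager, schedules every listing page by hand, and passes the page number along via meta; parse_taoche then follows each car's detail link. The logger created with logging.getLogger(__name__) is routed through Scrapy's logging settings; a minimal sketch of the settings.py options that would control it (the file name and level here are illustrative assumptions, not from the original post):

    # settings.py (excerpt; values are assumptions for illustration)
    LOG_LEVEL = 'WARNING'    # drop Scrapy's chatty DEBUG/INFO records
    LOG_FILE = 'taoche.log'  # write log output to a file instead of the console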
    

    16_管道、日志与全站爬取/myspider02/myspider02/items.py:

    import scrapy
    
    
    class Myspider02Item(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        CarFigure = scrapy.Field()
        Title = scrapy.Field()
        RegisterYear = scrapy.Field()
        mileage = scrapy.Field()
        city = scrapy.Field()
        selling_price = scrapy.Field()
        price = scrapy.Field()
        detail = scrapy.Field()
    
    
    class TaocheParamenterConfig(scrapy.Item):
        displacement = scrapy.Field()
        gearbox = scrapy.Field()
        BrandModel = scrapy.Field()
        SourceLocation = scrapy.Field()
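
    The detail field on Myspider02Item holds an entire TaocheParamenterConfig, so a single stored record carries both the listing fields and the detail-page fields. A minimal sketch of the nesting (the values are made up):

    item = Myspider02Item(Title='BMW 3 Series', price='25.00')
    item['detail'] = TaocheParamenterConfig(displacement='2.0T', gearbox='automatic')
    # note: dict(item) is shallow, so item['detail'] stays an Item instance;
    # convert it as well if your storage layer needs plain dicts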
    

    16_管道、日志与全站爬取/myspider02/myspider02/MyMongoDB.py:

    from pymongo import MongoClient
    
    
    class MyMongoDB:
        def __init__(self, database, collection):
            # Connect only once; never create the connection inside a loop!
            # MongoDB connection
            conn = MongoClient('localhost', 8881)  # connect to the MongoDB server (note: 8881 is this setup's custom port; MongoDB's default is 27017)
            db = conn[database]
            self.my_set = db[collection]
    
        def insert(self, data, onlyOne=True):
            if not isinstance(onlyOne, bool):
                raise TypeError
            if onlyOne:
                self.my_set.insert_one(data)
            else:
                self.my_set.insert_many(data)
    
        def find(self, query=None, onlyOne=True):
            if not isinstance(onlyOne, bool):
                raise TypeError
            # find_one returns a single document, find returns a cursor
            return self.my_set.find_one(query) if onlyOne else self.my_set.find(query)
    
        def update(self, data, new_data, onlyOne=True):
            if not isinstance(onlyOne, bool):
                raise TypeError
            if onlyOne:
                self.my_set.update_one(data, {'$set': new_data})
            else:
                self.my_set.update_many(data, {'$set': new_data})

        def delete(self, data, onlyOne=True):
            if not isinstance(onlyOne, bool):
                raise TypeError
            if onlyOne:
                self.my_set.delete_one(data)
            else:
                self.my_set.delete_many(data)
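
    A quick sketch of how the wrapper is meant to be used (the sample document here is made up for illustration):

    from myspider02.MyMongoDB import MyMongoDB

    db = MyMongoDB('taoche', 'car')  # connect once, reuse everywhere
    db.insert({'Title': 'BMW 3 Series', 'price': '25.00'})    # insert_one by default
    doc = db.find({'Title': 'BMW 3 Series'})                  # find_one by default
    db.update({'Title': 'BMW 3 Series'}, {'price': '24.50'})  # update_one with $set
    db.delete({'Title': 'BMW 3 Series'})                      # delete_one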
    

    16_管道、日志与全站爬取/myspider02/myspider02/pipelines.py:

    from itemadapter import ItemAdapter
    from myspider02.MyMongoDB import MyMongoDB
    
    
    class Myspider02Pipeline:
        mongoDB = None
    
        def open_spider(self, spider):
            if spider.name == "taoche":
                print('Start crawling')
                self.mongoDB = MyMongoDB('taoche', 'car')
    
        def process_item(self, item, spider):
            if spider.name == "taoche":
                self.mongoDB.insert(dict(item))
            return item
    
        def close_spider(self, spider):
            if spider.name == "taoche":
                print('Finished crawling')
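
    The pipeline only runs once it is registered in settings.py. A minimal sketch (300 is just a conventional priority value, not something shown in the post):

    # settings.py (excerpt)
    ITEM_PIPELINES = {
        'myspider02.pipelines.Myspider02Pipeline': 300,
    }

    With that in place, scrapy crawl taoche opens a single MongoDB connection in open_spider, writes every yielded item in process_item, and prints the closing message in close_spider.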
    

    That's all for this article! I hope you'll keep supporting this Python series! I can walk you through Python in six months; message me privately with any questions about this article. A new article goes up every day, so follow along if you like them! A fellow young developer keeping you company while you learn Python; however busy things get, the updates will keep coming. Let's keep at it together!

    Editor: Lonelyroots
