import scrapy
import logging

from myspider02.items import Myspider02Item, TaocheParamenterConfig

logger = logging.getLogger(__name__)


class TaocheSpider(scrapy.Spider):
    name = 'taoche'
    allowed_domains = ['taoche.com']
    start_urls = ['https://changsha.taoche.com/bmw/']
    # URL template for the paginated listing pages
    url = 'https://changsha.taoche.com/bmw/?page=%d'
    count = 0
    def parse(self, response):
        # The second-to-last pager link holds the highest page number
        max_page = response.xpath('//div[@class="paging-box the-pages"]/div/a[last()-1]/text()').extract_first()
        # logger.error(max_page)
        for page in range(1, int(max_page) + 1):
            new_url = self.url % page
            # Manually request every page, handing the URL to the scheduler (as a Request object)
            """
            How to hand a request to the scheduler:
                yield scrapy.Request()
                    url: the address to request
                    callback: the function that processes the response data
                    meta: data to pass along
            Every request carries the meta dict {'page': page},
            which is forwarded to the corresponding response:
                response.meta = meta
                response.meta['page']
            """
            yield scrapy.Request(url=new_url, callback=self.parse_taoche, meta={'page': page})
    def parse_taoche(self, response):
        # logger.error(f'{response.meta["page"]}')
        # One listing page per call: extract the list of cars on it
        car_list = response.xpath('//div[@id="container_base"]/ul/li')
        for car in car_list:
            # # count is used only for testing
            # self.count += 1
            # logger.error(self.count)
            CarFigure = car.xpath('./div[1]/div/a/img/@src').extract_first()
            Title = car.xpath('./div[2]/a/span/text()').extract_first()
            RegisterYear = car.xpath('./div[2]/p/i[1]/text()').extract_first()
            mileage = car.xpath('./div[2]/p/i[2]/text()').extract_first()
            city = car.xpath('./div[2]/p/i[3]/text()').extract_first().strip()
            selling_price = car.xpath('./div[2]/div[1]/i[1]/text()').extract_first()
            price = car.xpath('.//div[@class="price"]/i[2]/text()').extract_first()
            item = Myspider02Item()
            item['CarFigure'] = CarFigure
            item['Title'] = Title
            item['RegisterYear'] = RegisterYear
            item['mileage'] = mileage
            item['city'] = city
            item['selling_price'] = selling_price
            item['price'] = price
            # logger.error(item)
            # Grab the URL of each car's detail page; urljoin resolves relative
            # or protocol-relative hrefs against the current page
            detail_url = car.xpath('./div[1]/div/a/@href').extract_first()
            yield scrapy.Request(url=response.urljoin(detail_url), callback=self.parse_detail, meta={'item': item})
    def parse_detail(self, response):
        # The dd text holds displacement and gearbox separated by '/'
        attrs = response.xpath('/html/body/div[9]/div[1]/div[2]/div[4]/div/dl[3]/dd/text()').extract_first()
        displacement, gearbox = attrs.split('/')
        BrandModel = response.xpath('/html/body/div[9]/div[10]/div[2]/div[1]/ul/li[1]/span/a/text()').extract_first()
        SourceLocation = response.xpath('/html/body/div[9]/div[10]/div[2]/div[1]/ul/li[2]/span/text()').extract_first()
        taocheParamenterConfig = TaocheParamenterConfig()
        taocheParamenterConfig['displacement'] = displacement
        taocheParamenterConfig['gearbox'] = gearbox
        taocheParamenterConfig['BrandModel'] = BrandModel
        taocheParamenterConfig['SourceLocation'] = SourceLocation
        # Attach the detail item to the listing item, like a foreign-key association
        item = response.meta['item']
        item['detail'] = taocheParamenterConfig
        # logger.error(item)
        yield item
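
You can launch the spider with the usual `scrapy crawl taoche` command, or from a script. Below is a minimal runner sketch; the file name run.py is illustrative, and it assumes the script sits in the project root next to scrapy.cfg:

# run.py -- a minimal runner sketch, assuming it lives next to scrapy.cfg
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('taoche')  # the spider name defined in TaocheSpider.name
process.start()          # blocks until the crawl finishes
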
16_管道、日志与全站爬取/myspider02/myspider02/items.py:
import scrapy


class Myspider02Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    CarFigure = scrapy.Field()
    Title = scrapy.Field()
    RegisterYear = scrapy.Field()
    mileage = scrapy.Field()
    city = scrapy.Field()
    selling_price = scrapy.Field()
    price = scrapy.Field()
    detail = scrapy.Field()


class TaocheParamenterConfig(scrapy.Item):
    displacement = scrapy.Field()
    gearbox = scrapy.Field()
    BrandModel = scrapy.Field()
    SourceLocation = scrapy.Field()
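
Both classes subclass scrapy.Item, which behaves like a dict restricted to its declared fields. A quick illustration, with made-up values:

# Item semantics in a nutshell (values are made up for illustration)
item = Myspider02Item()
item['Title'] = 'BMW 3 Series'
item['detail'] = TaocheParamenterConfig(gearbox='automatic')
print(dict(item))        # converts like a dict: {'Title': ..., 'detail': ...}
# item['color'] = 'red'  # would raise KeyError: 'color' is not a declared Field
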
16_管道、日志与全站爬取/myspider02/myspider02/MyMongoDB.py:
from pymongo import MongoClient


class MyMongoDB:
    def __init__(self, database, collection):
        # Connect only once -- never put this inside a loop!
        conn = MongoClient('localhost', 8881)  # connect to the MongoDB server
        db = conn[database]
        self.my_set = db[collection]

    def insert(self, data, onlyOne=True):
        if not isinstance(onlyOne, bool):
            raise TypeError
        self.my_set.insert_one(data) if onlyOne else self.my_set.insert_many(data)

    def find(self, query=None, onlyOne=True):
        if not isinstance(onlyOne, bool):
            raise TypeError
        # return the matched document (or cursor), otherwise find() would always give None
        return self.my_set.find_one(query) if onlyOne else self.my_set.find(query)

    def update(self, data, new_data, onlyOne=True):
        if not isinstance(onlyOne, bool):
            raise TypeError
        self.my_set.update_one(data, {'$set': new_data}) if onlyOne else self.my_set.update_many(data, {'$set': new_data})

    def delete(self, data, onlyOne=True):
        if not isinstance(onlyOne, bool):
            raise TypeError
        self.my_set.delete_one(data) if onlyOne else self.my_set.delete_many(data)
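
A quick smoke test of the wrapper, assuming a MongoDB instance is actually listening on localhost:8881 as configured above:

# Smoke test for MyMongoDB (assumes MongoDB is reachable on localhost:8881)
db = MyMongoDB('taoche', 'car')
db.insert({'Title': 'demo', 'price': '10.00'})   # insert_one
print(db.find({'Title': 'demo'}))                # find_one returns the document
db.update({'Title': 'demo'}, {'price': '9.50'})  # update_one with $set
db.delete({'Title': 'demo'})                     # delete_one
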
16_管道、日志与全站爬取/myspider02/myspider02/pipelines.py:
from itemadapter import ItemAdapter
from myspider02.MyMongoDB import MyMongoDB


class Myspider02Pipeline:
    mongoDB = None

    def open_spider(self, spider):
        if spider.name == "taoche":
            print('Crawl started')
            self.mongoDB = MyMongoDB('taoche', 'car')

    def process_item(self, item, spider):
        if spider.name == "taoche":
            data = dict(item)
            # item['detail'] is itself a scrapy.Item; convert it to a plain dict
            # as well so that pymongo can encode the nested document
            data['detail'] = dict(data['detail'])
            self.mongoDB.insert(data)
        return item

    def close_spider(self, spider):
        if spider.name == "taoche":
            print('Crawl finished')
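
The pipeline only runs once it is registered in the project's settings.py. A minimal sketch of the relevant entries follows; the priority value 300 and the log settings are illustrative, not mandatory:

# settings.py (excerpt)
ITEM_PIPELINES = {
    'myspider02.pipelines.Myspider02Pipeline': 300,  # lower number = earlier in the chain
}
LOG_LEVEL = 'ERROR'         # matches the logger.error() debugging calls in the spider
# LOG_FILE = 'taoche.log'   # optionally send the log to a file instead of the console
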
That's the end of this article! I hope you'll keep supporting the Python series! I can get you through Python in six months; message me privately with any questions about this article! New articles will be published every day, so follow along if you like them! A young companion for your Python studies! No matter how busy I get, I will keep updating, so let's keep at it together!
Editor: Lonelyroots