花了半天时间，终于把爬虫写完了。中间 pipelines 这一块真的是个坑，希望这篇文章也能给大家当个避坑指南！
第一步新建项目
scrapy startproject wangzherongyao
scrapy genspider wangzhe qq.com
第二步
最重要的一步
分析页面
解析效果
爬虫代码
wangzhe.py
import json
import scrapy
from wangzherongyao.items import WangzherongyaoItem
class WangzheSpider(scrapy.Spider):
    """Emit one item per hero skin listed in the pvp.qq.com hero-list JSON.

    Each JSON entry provides ``ename``, ``cname`` and, optionally, a
    ``skin_name`` string whose skins are separated by ``|``.  Skin image
    URLs are built from ``skin_url`` using the hero's ``ename`` and the
    1-based position of the skin in that string.
    """

    name = 'wangzhe'
    # The hero list is served from qq.com, but the skin images live on
    # game.gtimg.cn.  Scrapy releases that run the offsite filter as a
    # downloader middleware can also apply it to media-pipeline downloads,
    # so the image host has to be allowed as well.
    allowed_domains = ['qq.com', 'gtimg.cn']
    start_urls = ['https://pvp.qq.com/web201605/js/herolist.json']
    skin_url = "https://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/{}/{}-bigskin-{}.jpg"

    def parse(self, response):
        """Parse the hero list and yield a ``WangzherongyaoItem`` per skin.

        Heroes without a (non-empty) ``skin_name`` entry yield nothing.
        """
        for hero in json.loads(response.text):
            ename = hero["ename"]
            cname = hero["cname"]
            skin_name = hero.get('skin_name')
            if not skin_name:
                continue
            # Image files are numbered from 1 in the order the skins appear.
            for position, skinname in enumerate(skin_name.split("|"), start=1):
                item = WangzherongyaoItem()
                item["ename"] = ename
                item["cname"] = cname
                item["pic"] = self.skin_url.format(ename, ename, position)
                item["skinname"] = skinname
                yield item
item文件
items.py
import scrapy
class WangzherongyaoItem(scrapy.Item):
    """A single hero skin record produced by the spider."""

    # Value of the "ename" key of the hero's JSON entry; also used twice
    # when building the image URL.
    ename = scrapy.Field()
    # Value of the "cname" key of the hero's JSON entry.
    cname = scrapy.Field()
    # Full URL of the skin image to download.
    pic = scrapy.Field()
    # Name of this skin (one "|"-separated piece of "skin_name").
    skinname = scrapy.Field()
pipelines
文件需要重写三个方法：get_media_requests、item_completed 和 file_path
pipelines.py
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.project import get_project_settings
import scrapy
# Read the IMAGES_STORE value from the project settings when this module is
# imported.  The pipeline class shown in this file does not reference it.
IMAGES_STORE = get_project_settings().get('IMAGES_STORE')
class WangZheImagePipeline(ImagesPipeline):
    """Download each item's skin image and store it as ``<cname>_<skinname>.jpg``."""

    def get_media_requests(self, item, info):
        """Yield the single download request for ``item['pic']``.

        The item is attached to ``request.meta`` so that ``file_path`` can
        still name the file when it is called without an ``item`` argument.
        """
        yield scrapy.Request(url=item["pic"], meta={'item': item})

    def item_completed(self, results, item, info):
        """Return ``item`` if at least one download succeeded.

        ``results`` is iterated as ``(ok, detail)`` pairs.

        Raises:
            DropItem: if no entry in ``results`` is marked successful.
        """
        if not any(ok for ok, _ in results):
            raise DropItem('Item contains no images')
        return item

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the file name for a downloaded image.

        ``item`` is accepted as an optional keyword-only argument so the
        override matches callers that pass it; when it is not supplied the
        item is taken from ``request.meta['item']`` instead.
        """
        if item is None:
            item = request.meta['item']
        return f"{item['cname']}_{item['skinname']}.jpg"
settings文件
# Directory (relative to where the crawl is started) that receives the
# downloaded images.
IMAGES_STORE = 'file'
LOG_FILE = "wangzhe.log"
LOG_LEVEL = "DEBUG"
# Item pipelines only run when they are enabled here; without this entry
# the custom image pipeline is never invoked.
ITEM_PIPELINES = {
    'wangzherongyao.pipelines.WangZheImagePipeline': 300,
}
最后放一张成品图,哈哈哈~
image.png
网友评论