scrapy 爬虫
目标把gank上的图片趴下来
镇楼图
2018-07-30.jpg 2018-08-16.jpg 2018-09-19.jpg

// 初始化项目
scrapy startproject demo
修改items对象
import scrapy
import os
import requests
class GankItem(scrapy.Item):
    """Item for one gank.io daily page: title, image URL and page URL."""
    # define the fields for your item here like:
    name = scrapy.Field()
    imageurl = scrapy.Field()
    url = scrapy.Field()

    def canParse(self):
        """Return True when both the title and the image URL were scraped."""
        return self['name'] != '' and self['imageurl'] != ''

    def downLoad(self):
        """Download the image to <year>-<month>-<day>.<suffix> in the CWD.

        Skips files that already exist. Raises requests.HTTPError on a
        non-2xx response and requests.Timeout after 30 seconds.
        """
        filename = 'file'
        segments = self['url'].split('/')
        if len(segments) > 3:
            # the last three path segments of the page URL are year/month/day
            filename = '-'.join(segments[-3:])
        suffix = 'jpg'
        parts = self['imageurl'].split('.')
        if len(parts) >= 2:
            suffix = parts[-1]
        path = filename + '.' + suffix
        if not os.path.exists(path):
            print(path)
            # Fetch BEFORE opening the file: previously a failed request left
            # an empty file behind, and the exists() guard above would then
            # block every future retry for this image.
            r = requests.get(self['imageurl'], timeout=30)
            r.raise_for_status()
            with open(path, 'wb') as fp:
                fp.write(r.content)
pipelines（管道）
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
class GankPipeline(object):
    """Pipeline that downloads the image for every fully-parsed item."""

    def process_item(self, item, spider):
        # Only items that scraped both a title and an image URL are usable.
        if item.canParse():
            item.downLoad()
        # Scrapy contract: process_item must return the item (or raise
        # DropItem) so that any later pipelines in ITEM_PIPELINES still
        # receive it; the original returned None and silently broke them.
        return item
新建 gank spider
import scrapy
from demo.spiders.gank import GankItem
class GankSpider(scrapy.Spider):
    """Crawl gank.io daily pages starting at 2018-10-22, yielding one
    GankItem per page and following the previous-day link recursively.

    Note: the original declared ``class GankSpider(scrapy.Spider, count=1)``;
    the unexpected ``count`` class keyword raises TypeError at class-creation
    time under Python 3, so it has been removed.
    """
    name = "gank"
    allowed_domains = ["gank.io"]
    start_urls = ["https://gank.io/2018/10/22"]

    def parse(self, response):
        item = GankItem()
        item['url'] = response.url
        # extract_first(default='') instead of extract()[0]: pages missing a
        # title or image no longer raise IndexError — the pipeline's
        # canParse() check already drops such empty items.
        item['name'] = response.xpath(
            '//div[@class="container content"]/h1/text()'
        ).extract_first(default='')
        item['imageurl'] = response.xpath(
            '//div[@class="container content"]'
            '/div[@class="outlink"]//p/img/@src'
        ).extract_first(default='')
        yield item
        # Follow the link to the previous day's page, if one exists.
        prev_href = response.xpath(
            '//div[@class="container content"]/div[@class="row"]'
            '/div[@class="six columns"]/p[@style="text-align: right"]'
            '/a/@href'
        ).extract_first()
        if prev_href:
            newurl = "https://gank.io" + prev_href
            print(newurl)
            yield scrapy.Request(newurl, callback=self.parse)
修改 settings.py，打开 ITEM_PIPELINES 配置（注意项目名是 demo，不是 gank）：
ITEM_PIPELINES = {
    'demo.pipelines.GankPipeline': 300,
}
就跑起来了（spider 的 name 是 gank）：
scrapy crawl gank
年轻人注意身体
原文链接:https://blog.csdn.net/qq_22329521/article/details/83446096
网友评论