Environment setup
pip install scrapy
To verify, run the scrapy command in a terminal; if no error is reported, the installation succeeded.
- Create a project: scrapy startproject xxxPro
- The spiders subdirectory is where the spider files live; create a spider file inside it:
- scrapy genspider spiderName www.xxx.com
- cd xxxPro first to switch into the project directory, then run the command above; the generated spider file is placed in the spiders directory by default.
- settings.py is the project configuration file.
- Run the project:
- scrapy crawl spiderName
import scrapy

class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'  # the spider's name: the unique identifier of this spider file
    # allowed_domains = ['www.qiubai.com']  # allowed domains; restricts which start_urls may actually be requested (usually left commented out)
    start_urls = ['http://www.baidu.com/', 'https://www.sogou.com/']  # starting URL list; scrapy sends requests to these URLs automatically

    # Used for data parsing; response is the response object of a successful request.
    # parse is called once per URL in start_urls.
    def parse(self, response):
        print(response, "-----")
Running the project prints a lot of log output. If you only want to see your own output you can run
scrapy crawl qiubai --nolog, which suppresses logging entirely (not recommended).
Recommended: in the configuration file settings.py, set LOG_LEVEL = 'ERROR'
so that only error-level log messages are shown:
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.49'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'
Scrapy data parsing
- response.xpath() returns a list, but its elements are always Selector objects.
- extract() pulls out the string stored in a Selector object's data attribute.
- Calling extract() on the list extracts the data string from every Selector in it.
- To turn the resulting list into a single string: s = "".join(lst)
- extract_first() performs extract() on element 0 of the list; recommended when the Selector list has exactly one element (see the sketch below).
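A minimal sketch pulling these calls together; the spider name, URL, and XPath expressions are placeholders, not from any project in these notes:
import scrapy

class DemoSpider(scrapy.Spider):
    name = 'demo'
    start_urls = ['https://example.com/']  # placeholder URL

    def parse(self, response):
        # xpath() returns a SelectorList of Selector objects
        div_list = response.xpath('//div[@class="content"]')
        # extract() on the list pulls the data string out of every Selector
        texts = div_list.xpath('./span/text()').extract()
        joined = ''.join(texts)  # list -> single string
        # extract_first() extracts only the first matching Selector's string
        first = div_list.xpath('./span/text()').extract_first()
        print(joined, first)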
Scrapy persistent storage
- Feed export (terminal command) based. Restriction: only the return value of the parse method can be stored, and only to a local file.
Note: only these formats are supported: 'json', 'jsonlines', 'jl', 'csv', 'xml', 'marshal', 'pickle'. With this approach, parse just needs to return the data you want to store.
Command: scrapy crawl picd -o ./qiubai.csv
Pros: simple, efficient, convenient.
Cons: quite limited (data can only be stored to files with the supported extensions).
import scrapy

class PicdSpider(scrapy.Spider):
    name = 'picd'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://funletu.com/dong-tai/page/2']

    def parse(self, response):
        all_data = []
        aList = response.xpath('//div[@class="s-a-c-l"]/h2/a')
        # The last three entries are dynamic; to drop them,
        # iterate by index instead: for i in range(len(aList) - 3)
        for a in aList:
            title = a.xpath('./text()').extract_first()
            path = a.xpath('./@href').extract_first()
            print(title, path)
            dic = {'title': title, 'path': path}
            all_data.append(dic)
        return all_data
- Pipeline based (the common approach)
Workflow:
1. Parse the data.
2. Define the relevant fields in items.py (see the sketch below).
3. Pack the parsed data into an item object.
4. Submit the item object to the pipeline for persistent storage.
5. In the pipeline class's process_item method, persist the data carried by the received item.
6. Enable the pipeline in the configuration file.
Pros: very general-purpose.
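For step 2, the fields defined in items.py must match what the pipeline below reads (item['title'] and item['path']); a minimal sketch, where the class name PicsoproItem is an assumption chosen to match the project's pipeline class:
# items.py (sketch): one Field per attribute the spider parses and the pipeline stores
import scrapy

class PicsoproItem(scrapy.Item):
    title = scrapy.Field()
    path = scrapy.Field()

In the spider, each record is then wrapped in this item (item = PicsoproItem(); item['title'] = title; item['path'] = path) and handed to the pipeline with yield item instead of being returned as a list.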
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class PicsoproPipeline(object):
    fp = None

    def open_spider(self, spider):
        print('开始-start')
        self.fp = open('./qiubai.txt', 'w', encoding='utf-8')

    # Handles item objects submitted by the spider file.
    # This method is called once for every item it receives.
    def process_item(self, item, spider):
        title = item['title']
        path = item['path']
        self.fp.write(title + "---" + path + '\n')
        return item  # pass the item on to the next pipeline class in line

    def close_spider(self, spider):
        print('结束-close')
        self.fp.close()


# In the pipelines file, each pipeline class stores one copy of the data to one platform or medium.
class mysqlPileLine(object):
    conn = None
    cursor = None

    def open_spider(self, spider):
        self.conn = pymysql.Connect(host='192.168.101.11', port=3306, user='yanchenglong',
                                    password='a123456', charset='utf8', db='day17')

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        try:
            # parameterized query avoids quoting/escaping problems
            self.cursor.execute('insert into pachong values(null, %s, %s)',
                                (item['title'], item['path']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
Interview question: how would you store one copy of the scraped data locally and another copy in a database?
- In the pipelines file, each pipeline class stores the data to one kind of platform.
- The item submitted by the spider file is only handed to the first pipeline class that runs.
- return item in process_item passes the item on to the next pipeline class in line (see the settings sketch below).
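A settings.py sketch for registering both pipeline classes; the module path xxxPro and the priority numbers are assumptions, and the lower number runs first:
# settings.py (sketch): enable both pipelines; the item flows through them in priority order
ITEM_PIPELINES = {
    'xxxPro.pipelines.PicsoproPipeline': 300,  # writes ./qiubai.txt, then return item passes the item on
    'xxxPro.pipelines.mysqlPileLine': 301,     # receives the same item and writes it to MySQL
}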
Full-site crawling with Scrapy
Explanation: crawl the page data of every page number under a given section.
Requirement: crawl the images and names from every page of an image site.
Implementation options:
- Add every page's URL to the start_urls list (not recommended).
- Send the follow-up requests manually yourself (recommended).
Manual request sending: yield scrapy.Request(url, callback); callback is the function dedicated to data parsing.
import scrapy

class ExamplexhsjSpider(scrapy.Spider):
    name = 'exampleXHSJ'
    # allowed_domains = ['www.cheaa.org']
    start_urls = ['https://www.cheaa.org/channels/116.html']
    # generic URL template for the other pages
    url = 'https://www.cheaa.org/channels/116_%d.html'
    page_number = 2

    def parse(self, response):
        li_list = response.xpath('//table[@class="title_list"]/tr/td/a')
        for li in li_list:
            names = li.xpath('.//text()').extract_first()
            link = li.xpath('./@href').extract_first()
            print(names + "\t" + link)
        if self.page_number <= 3:
            new_url = format(self.url % self.page_number)
            self.page_number += 1
            # send the next request manually; callback is the parsing function
            yield scrapy.Request(url=new_url, callback=self.parse)
- The five core components
Engine (Scrapy Engine)
Handles the data flow of the whole system and triggers events (the core of the framework). It generates request objects and sends them to the scheduler.
Scheduler (Scheduler)
Accepts the requests sent over by the engine, pushes them into a queue, and returns them when the engine asks again. Think of it as having two parts: a duplicate filter and a queue.
Downloader (Downloader)
Downloads page content and returns it to the spiders. The downloader is built on Twisted, an efficient asynchronous model.
Spiders (spiders)
Do the main work: they extract the information you need from specific pages; this is where the so-called entities (Items) come from.
Item Pipeline (Pipeline)
Handles the entities extracted by the spiders; its main jobs are persisting the entities, validating them, and clearing out unneeded information.
- Passing data between requests (request meta)
Use case: the data to scrape and parse is not all on the same page (deep crawling).
Requirement: see the spider below.
import scrapy
from bpsPro.items import BpsproItem
# from scrapyDemo.bpsPro.bpsPro.items import BpsproItem

class BpsSpider(scrapy.Spider):
    name = 'bps'
    # allowed_domains = ['sousuo.gov.cn']
    start_urls = ['http://sousuo.gov.cn/column/30214/1.htm']
    url = 'http://sousuo.gov.cn/column/30214/%d.htm'
    page_num = 2

    # parse the names on the list page
    def parse(self, response):
        li_list = response.xpath('//ul[@class="listTxt"]/li/h4')
        for li in li_list:
            name = li.xpath('./a//text()').extract_first()
            # print(name)
            item = BpsproItem()
            item['name'] = name
            detail_url = li.xpath('./a/@href').extract_first()
            # request the detail page;
            # meta passes a dict along to the callback (request passing)
            yield scrapy.Request(detail_url, callback=self.parse_content, meta={'item': item})
        # pagination
        if self.page_num <= 3:
            new_url = format(self.url % self.page_num)
            self.page_num += 1
            yield scrapy.Request(new_url, self.parse)

    def parse_content(self, response):
        content = response.xpath('//div[@id="UCAP-CONTENT"]/p[1]//text()').extract_first()
        item = response.meta['item']
        item['content'] = content
        yield item
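The items.py for bpsPro is not shown in these notes; a minimal sketch with the two fields the spider above fills in:
# items.py (sketch) for bpsPro
import scrapy

class BpsproItem(scrapy.Item):
    name = scrapy.Field()
    content = scrapy.Field()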
- Image scraping with ImagesPipeline
You only need to parse out the image src attribute values and submit them to the pipeline; the pipeline then sends a request for each src itself, fetches the binary image data, and also handles persistent storage for us.
- Usage flow
- Parse the data (the image addresses).
- Submit the item holding the image address to the designated pipeline class.
- In the pipelines file, define a custom pipeline class based on ImagesPipeline.
- Override the pipeline class's methods.
If no output folder is generated while using it, try installing Pillow (see the note below).
Note: the image_urls field that scrapy provides for items.py has a restricted format: it only accepts a list value.
If you get a "No module named 'PIL'" error, Pillow is not installed; pip install pillow fixes it.
- imgPro.py
import scrapy
import re
from imgsPro.items import ImgsproItem

class ImgproSpider(scrapy.Spider):
    name = 'imgPro'
    # allowed_domains = ['n.monster']
    start_urls = ['https://vpton.monster/comic/2497']

    def parse(self, response):
        li_list = response.xpath('//ul[@class="mh-list col7"]/li')
        for li in li_list:
            name = li.xpath('./div/a/@title').extract_first()
            src = li.xpath('./div/a/p/@style').extract_first()
            # the style value looks like:
            # background-image: url(https://atic/upload/book/2488/cb5d29544077691b4c68b502caa096fe.jpg)
            # use a regular expression to pull the address out of it
            ex_content = r'url\((.*?)\)'  # parentheses are escaped so they match literally
            content = re.findall(ex_content, src, re.M)
            item = ImgsproItem()
            item['src'] = content[0]
            yield item
- items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class ImgsproItem(scrapy.Item):
    # define the fields for your item here like:
    src = scrapy.Field()
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
import scrapy
from itemadapter import ItemAdapter
# class ImgsproPipeline:
# def process_item(self, item, spider):
# return item
from scrapy.pipelines.images import ImagesPipeline


class ImgPileLine(ImagesPipeline):
    # request the image data for each image address
    def get_media_requests(self, item, info):
        yield scrapy.Request(item['src'], meta={'item': item})

    # decide the file name the image is saved under
    def file_path(self, request, response=None, info=None, *, item=None):
        imgName = request.url.split('/')[-1]
        return imgName

    def item_completed(self, results, item, info):
        return item  # pass the item on to the next pipeline class, same idea as process_item
- settings.py
Key settings:
# the custom pipeline class to use
ITEM_PIPELINES = {
    'imgsPro.pipelines.ImgPileLine': 1,
}
# directory where the images are stored
IMAGES_STORE = "images"
Middleware
- Between the engine and the downloader sits a middleware layer called the downloader middleware.
- Between the engine and the spiders sits a middleware layer called the spider middleware.
Downloader middleware
Position: between the engine and the downloader. Role: intercept every request and response made anywhere in the project.
Intercepting requests: UA spoofing in process_request; proxy IP in process_exception (return the request to resend it).
Intercepting responses: tamper with the response data / response object.
Requirement: scrape the news data (title and content) from NetEase News:
- From the NetEase News home page, parse the detail-page URLs of the five target sections (not dynamically loaded).
- The news titles inside each section are loaded dynamically.
- From each entry's detail-page URL, fetch the detail page source and parse out the article content.
Configure the middleware in middlewares.py:
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
import random

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class ZzjproSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class ZzjproDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.
    User_Agent_List = [
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) Gecko/20130328 Firefox/22.0",
        "Opera/9.80 (Windows NT 6.1; U; fi) Presto/2.7.62 Version/11.00",
        "Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; Media Center PC 6.0; InfoPath.2; MS-RTC LM 8)",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; fr-FR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.13+ (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
    ]
    PROXY_http = [
        '182.61.201.201:80'
    ]
    PROXY_https = [
        '150.136.178.43:80',
        '216.21.18.194:80'
    ]

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    # intercepts every request
    def process_request(self, request, spider):
        # UA spoofing
        request.headers['User-Agent'] = random.choice(self.User_Agent_List)
        # a fixed proxy, set here only to verify that the proxy takes effect
        request.meta['proxy'] = 'http://' + '182.61.201.201:80'
        # Called for each request that goes through the downloader middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    # intercepts every response
    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    # intercepts requests whose download raised an exception
    def process_exception(self, request, exception, spider):
        # switch to a random proxy that matches the request scheme
        if request.url.split(':')[0] == 'http':
            request.meta['proxy'] = 'http://' + random.choice(self.PROXY_http)
        else:
            request.meta['proxy'] = 'https://' + random.choice(self.PROXY_https)
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        return request  # resend the corrected request
settings.py
# Scrapy settings for zzjPro project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'zzjPro'
SPIDER_MODULES = ['zzjPro.spiders']
NEWSPIDER_MODULE = 'zzjPro.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.62'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'zzjPro.middlewares.ZzjproSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
'zzjPro.middlewares.ZzjproDownloaderMiddleware': 543,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'zzjPro.pipelines.ZzjproPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
Exercise
Scrape the NetEase News data:
- From the NetEase News home page, scrape the detail URLs of the five target sections.
- The news entries under each section are loaded dynamically.
- From each news entry, get the detail-page URL and parse out the article content.
Spider class
wangPro.py
import scrapy
from selenium import webdriver
from wangyiPro.items import WangyiproItem


class WangproSpider(scrapy.Spider):
    name = 'wangPro'
    # allowed_domains = ['news.163.com']
    start_urls = ['https://news.163.com/domestic/']
    models_urls = []  # stores the section detail-page URLs

    # instantiate a browser driver for rendering the dynamically loaded sections
    def __init__(self):
        self.bro = webdriver.Edge(executable_path=r'C:\Users\sj176\Downloads\edgedriver_win64\msedgedriver.exe')

    # parse the URLs of the five sections
    def parse(self, response):
        li_list = response.xpath('//div[@class="ns_area list"]/ul/li')
        all_list = [1, 2, 4, 5]
        for index in all_list:
            mode_url = li_list[index].xpath('./a/@href').extract_first()
            self.models_urls.append(mode_url)
        # request each section page in turn
        for url in self.models_urls:
            yield scrapy.Request(url, callback=self.parse_model)

    # the news titles inside each section are loaded dynamically
    def parse_model(self, response):  # parse the detail-page URLs
        dis_lis = response.xpath('//div[@class="data_row news_article clearfix "]/a')
        for div in dis_lis:
            title = div.xpath('./img/@alt').extract_first()
            item = WangyiproItem()
            item['title'] = title
            new_detail_url = div.xpath('./@href').extract_first()
            print(new_detail_url)
            yield scrapy.Request(url=new_detail_url, callback=self.parse_detail, meta={'item': item})

    def parse_detail(self, response):
        content = response.xpath('//*[@id="content"]/div[2]//text()').extract()
        content = ''.join(content)
        item = response.meta['item']
        item['content'] = content
        yield item

    def closed(self, spider):
        self.bro.quit()
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class WangyiproItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    content = scrapy.Field()
middlewares.py
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
from scrapy.http import HtmlResponse
import time
class WangyiproSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class WangyiproDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    # Intercept the response objects of the five sections and replace them
    # with responses built from the dynamically rendered page source.
    def process_response(self, request, response, spider):
        # all responses pass through here; only the five sections' responses are handled specially
        if request.url in spider.models_urls:
            # use the selenium browser to fetch the dynamically loaded data
            spider.bro.get(request.url)  # open the section page in the browser
            time.sleep(2)
            pageText = spider.bro.page_source
            # with open('./dd.html', 'w', encoding='utf-8') as wf:
            #     wf.write(pageText)
            new_response = HtmlResponse(url=request.url, body=pageText, encoding='utf-8', request=request)
            return new_response
        else:
            return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class WangyiproPipeline:
    def process_item(self, item, spider):
        print(item)
        return item
settings.py
# Scrapy settings for wangyiPro project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'wangyiPro'
SPIDER_MODULES = ['wangyiPro.spiders']
NEWSPIDER_MODULE = 'wangyiPro.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.62'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL='ERROR'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'wangyiPro.middlewares.WangyiproSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
'wangyiPro.middlewares.WangyiproDownloaderMiddleware': 543,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'wangyiPro.pipelines.WangyiproPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
CrawlSpider, a subclass of Spider
- Two ways to crawl a whole site: 1. manual requests based on Spider; 2. based on CrawlSpider.
- Using CrawlSpider:
- Create a project and cd into it.
- Create the spider file (CrawlSpider): scrapy genspider -t crawl sun www.xxx.com
- Link extractor: extracts links according to the specified rule (allow).
- Rule parser: parses the links the extractor produced according to the specified rule (callback).
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class SunSpider(CrawlSpider):
    name = 'sun'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://d.news.sun0769.com/hotline/review.asp?page=1']
    # Link extractor: extracts links matching the rule (allow="regex").
    # The ? would be interpreted by the regex engine, so escape it with a backslash.
    link = LinkExtractor(allow=r'review.asp\?page=\d+')
    rules = (
        # Rule parser: parses the extracted links with the specified callback
        Rule(link, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        print(response)
Requirement: from the sun0769 site, scrape the news number, news title, and news content.
- Analysis: the data to scrape is not all on one page.
- A link extractor can be used to extract every page-number link.
sun.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ygPro.items import YgproItem, DetailItem


class SunSpider(CrawlSpider):
    name = 'sun'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://d.news.sun0769.com/hotline/review.asp?page=1']
    # Link extractor: extracts links matching the rule (allow="regex"); escape the ? with a backslash
    link = LinkExtractor(allow=r'review.asp\?page=\d+')
    link_detail = LinkExtractor(allow=r'/hotline/ygwz_\d+\.htm')
    rules = (
        # Rule parser: parses the extracted links with the specified callback
        Rule(link, callback='parse_item', follow=True),
        # follow=True keeps applying the link extractor to the pages reached through the extracted links.
        # This produces duplicate URLs, but that is fine: the scheduler's built-in filter drops duplicate requests.
        Rule(link_detail, callback='parse_detail', follow=False),
    )

    # The two parse methods below cannot pass data to each other via request meta,
    # so the data is stored in two separate item classes instead.
    def parse_item(self, response):
        # http://news.sun0769.com/hotline/ygwz_462.htm
        # http://news.sun0769.com/hotline/ygwz_461.htm
        divs = response.xpath('//div[@class="school_photo1"] | //div[@class="school_photo2"]')
        for div in divs:
            name = div.xpath('./div[@class="school_pci2"]/text()').extract_first()
            item = YgproItem()
            item['name'] = name
            yield item

    # parse the news content and news number
    def parse_detail(self, response):
        new_id = response.xpath('//span[@class="txt16_b4"]/text()').extract_first()
        print(new_id)
        item = DetailItem()
        item['content'] = new_id
        yield item
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class YgproItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()


class DetailItem(scrapy.Item):
    content = scrapy.Field()
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class YgproPipeline:
    def process_item(self, item, spider):
        # decide which item type was received
        if item.__class__.__name__ == 'DetailItem':
            print(item['content'])
        else:
            print(item['name'])
        return item
settings.py
# Scrapy settings for ygPro project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'ygPro'
SPIDER_MODULES = ['ygPro.spiders']
NEWSPIDER_MODULE = 'ygPro.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.62'
LOG_LEVEL = "ERROR"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# }
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# 'ygPro.middlewares.YgproSpiderMiddleware': 543,
# }
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'ygPro.middlewares.YgproDownloaderMiddleware': 543,
# }
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
# }
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'ygPro.pipelines.YgproPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
Distributed crawling
Concept: build a distributed cluster and let the machines in it jointly crawl one shared set of resources.
Purpose: improve crawling efficiency.
How to make it distributed?
Install the scrapy-redis component.
Native Scrapy cannot do distributed crawling on its own; Scrapy must be combined with the scrapy-redis component.
Why can't native Scrapy crawl in a distributed way?
- The scheduler cannot be shared across a distributed cluster.
- The pipeline cannot be shared across a distributed cluster.
What scrapy-redis provides
- It gives the native Scrapy framework a shareable pipeline and scheduler.
Implementation steps
- Create a project.
- Create a spider file based on CrawlSpider.
- Modify the spider file:
- Import the package: from scrapy_redis.spiders import RedisCrawlSpider
- Comment out start_urls and allowed_domains and add the attribute redis_key = 'sun' (the name of the shared scheduler queue).
- Write the data-parsing logic.
- Change the parent class to RedisCrawlSpider.
- Modify the configuration file settings.py:
- Specify the shared pipeline:
# use the scrapy_redis pipeline and scheduler
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400
}
- Specify the scheduler:
# duplicate filter
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
# use scrapy_redis's own scheduler
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
# whether the scheduler persists: when the crawl ends, keep (True) or clear
# the request queue and the dedup fingerprint set in Redis
SCHEDULER_PERSIST = True
- Specify the Redis server:
REDIS_HOST = 'IP address of the redis server'
REDIS_PORT = 6379
- Redis-related configuration
Edit the redis configuration file:
- Linux or macOS: redis.conf
- Windows: redis.windows.conf
- Comment out bind 127.0.0.1 so that other machines can access this Redis instance:
# IF YOU ARE SURE YOU WANT YOUR INSTANCE TO LISTEN TO ALL THE INTERFACES
# JUST COMMENT THE FOLLOWING LINE.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# bind 127.0.0.1
- Turn off protected mode (so remote clients can write to this Redis instance):
change protected-mode yes to protected-mode no
- Start the Redis server with this configuration file:
redis-server redis.windows.conf
- Start the client:
redis-cli
- Run the project:
scrapy runspider xxx.py
- Push a starting URL into the scheduler's queue (the queue lives in Redis, so do this in the redis client):
lpush sun http://d.news.sun0769.com/hotline/review.asp?page=1
- The scraped data is stored in the Redis data structure proName:items (see the sketch below for reading it back).
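A sketch for inspecting the stored items from Python, assuming the default scrapy-redis key pattern '<spider name>:items' (for the fbs spider below that would be 'fbs:items'), a local Redis instance, and JSON-serialized items:
# Sketch: read back the items that scrapy_redis.pipelines.RedisPipeline pushed
import json
from redis import Redis

conn = Redis(host='127.0.0.1', port=6379)
for raw in conn.lrange('fbs:items', 0, -1):   # all stored items
    item = json.loads(raw)                    # RedisPipeline serializes items as JSON
    print(item.get('name'), item.get('new_num'))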
Errors hit while running, and their fixes. First, change the import at the top of the offending module to:
from collections.abc import Iterable
After that fix there was still an error:
scrapy-redis raised "AttributeError: 'DoubanSpider' object has no attribute 'make_requests_from_url'"
The fix (found on a CSDN Q&A thread) is to override the missing parent method in your spider class:
# This method must be overridden; otherwise the default is used, and depending on the
# Python/Scrapy version that default no longer exists, which causes the error above.
def make_requests_from_url(self, url):
    yield scrapy.Request(url=url)
Project address:
fbs.py
import scrapy
from collections.abc import Iterable
from scrapy.linkextractors import LinkExtractor
import sys, os
sys.path.append(os.path.dirname(os.path.dirname(__file__)))
from scrapy.spiders import CrawlSpider, Rule
from fbsPro.items import FbsproItem
from scrapy_redis.spiders import RedisCrawlSpider
# step 1: import RedisCrawlSpider


class FbsSpider(RedisCrawlSpider):
    name = 'fbs'
    # allowed_domains = ['www.xxx.com']
    # start_urls = ['http://www.xxx.com/']
    # step 2: comment out the start_urls list above
    redis_key = 'sun'
    # overriding attribute: the name of the shared scheduler queue in Redis
    rules = (
        Rule(LinkExtractor(allow=r'review.asp\?page=\d+'), callback='parse_item', follow=True),
    )
    # lpush sun http://d.news.sun0769.com/hotline/review.asp?page=1

    # This method must be overridden; otherwise the default is used, and depending on the
    # Python/Scrapy version that default no longer exists, which causes an error.
    def make_requests_from_url(self, url):
        yield scrapy.Request(url=url)

    def parse_item(self, response):
        # http://news.sun0769.com/hotline/ygwz_462.htm
        # http://news.sun0769.com/hotline/ygwz_461.htm
        divs = response.xpath('//div[@class="school_photo1"] | //div[@class="school_photo2"]')
        for div in divs:
            name = div.xpath('./div[@class="school_pci3_2"]//text()').extract_first()
            new_num = div.xpath('./div[@class="school_pci3_2"]/a/@href').extract_first()
            item = FbsproItem()
            item['name'] = name
            item['new_num'] = new_num
            yield item
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class FbsproItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    new_num = scrapy.Field()
settings.py
# Scrapy settings for fbsPro project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'fbsPro'
SPIDER_MODULES = ['fbsPro.spiders']
NEWSPIDER_MODULE = 'fbsPro.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.62'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'fbsPro.middlewares.FbsproSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'fbsPro.middlewares.FbsproDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'fbsPro.pipelines.FbsproPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# use the scrapy_redis pipeline and scheduler
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400
}
# specify the scheduler
# duplicate filter
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
# use scrapy_redis's own scheduler
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
# whether the scheduler persists: when the crawl ends, keep (True) or clear
# the request queue and the dedup fingerprint set in Redis
SCHEDULER_PERSIST = True
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
Incremental crawling
Concept: detect whether the site's data has been updated, and only crawl the newly added data.
Analysis:
- Specify a starting URL.
- Use CrawlSpider to collect the other page-number links.
- Use a Rule to request those page links.
- From each page's source, parse out every movie detail-page URL.
- Core idea: check whether a detail-page URL has been requested before, by storing every crawled detail-page URL (here, in a Redis set).
- Request only the detail-page URLs that have not been crawled yet, then parse out the movie name and synopsis.
- Persist the results.
Demo code
pbs.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis
from pbPro.items import PbproItem


class PbsSpider(CrawlSpider):
    name = 'pbs'
    # allowed_domains = ['www.pianba.tv']
    start_urls = ['https://www.pianba.tv/class/6--------1---.html']
    link = LinkExtractor(allow=r'/class/6--------(\d+)---.html')
    rules = (
        Rule(link, callback='parse_item', follow=False),
    )
    conn = Redis(host='127.0.0.1', port=6379)

    def parse_item(self, response):
        # parse the detail-page URLs
        lis_li = response.xpath('//ul[@class="stui-vodlist clearfix"]/li')
        for li in lis_li:
            href = 'https://www.pianba.tv' + li.xpath('./div/a/@href').extract_first()
            title = li.xpath('./div/a/@title').extract_first()
            item = PbproItem()
            item['name'] = title
            # add the detail URL to a Redis set for deduplication;
            # sadd returns 1 if the URL was new, 0 if it was already in the set
            ex = self.conn.sadd('urls', href)
            if ex == 1:
                print('URL not crawled before, crawling it now')
                yield scrapy.Request(url=href, callback=self.parse_cont, meta={'item': item})
            else:
                print('No new data to crawl')

    def parse_cont(self, response):
        con_li = response.xpath('//div[@class="stui-content__detail"]//text()').extract()
        item = response.meta['item']
        item['content'] = ''.join(con_li)
        yield item
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class PbproItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    content = scrapy.Field()
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class PbproPipeline:
    conn = None

    def open_spider(self, spider):
        self.conn = spider.conn

    def process_item(self, item, spider):
        dic = {
            'name': item['name'],
            'content': item['content']
        }
        self.conn.lpush('movieData', str(dic))
        return item
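Since the pipeline stores each record as str(dict) in the Redis list movieData, the records can be read back like this; a sketch assuming a local Redis instance, using ast.literal_eval to turn the stored strings back into dicts:
# Sketch: read the records the pipeline above pushed into the 'movieData' list
import ast
from redis import Redis

conn = Redis(host='127.0.0.1', port=6379)
for raw in conn.lrange('movieData', 0, -1):
    record = ast.literal_eval(raw.decode('utf-8'))  # stored via str(dict), so literal_eval restores it
    print(record['name'])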