scrapy爬虫框架
qsbk.py 爬虫代码
import scrapy
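'''
Scrapy framework notes.
Crawler workflow: send a request and receive the site's response, parse the response to extract data, then store the data (e.g. in MongoDB/Redis). Anti-anti-crawling measures include rotating IP proxies and adding browser request headers; requests are sent asynchronously.
Scrapy already packages these fundamentals, so you only write the extraction and storage code, which makes development faster and more efficient.
response.xpath(...) always returns a list of selectors (a SelectorList).
get() is equivalent to extract_first(): both return the first matching result.
getall() is equivalent to extract(): both return all matching results as a list.
'''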
class QsbkSpider(scrapy.Spider):
    """Spider that scrapes the text jokes listed on a qiushibaike.com page.

    Every joke is returned as a plain dict with the keys ``img``, ``author``,
    ``age``, ``gender``, ``content``, ``laugh`` and ``comments``.  A value is
    ``None`` when the corresponding node is missing from the page.
    """

    name = 'qsbk'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/text/page/1/']

    def parse(self, response):
        """Extract one dict per joke from a listing page.

        :param response: the downloaded listing page.
        :return: list of dicts, one per joke found on the page.
        """
        cross_talks = response.xpath('//div[@class="col1 old-style-col1"]/div')
        cross_talk_list = []
        for cross_talk in cross_talks:
            cross_talk_dict = {}

            # The avatar URL is scheme-relative in the page, so a scheme is
            # prepended; the key is kept (as None) when there is no avatar.
            img = cross_talk.xpath('.//div/a/img/@src').get()
            if img is not None:
                img = 'http:' + img
            cross_talk_dict['img'] = img

            # get() returns None when nothing matches, so guard before strip().
            author = cross_talk.xpath('.//a/h2/text()').get()
            cross_talk_dict['author'] = (
                author.strip() if author is not None else None
            )

            # NOTE(review): assumes the age is the text of the
            # "articleGender ..." div for every gender -- confirm against the
            # live markup.  The original only matched the womenIcon variant and
            # stored the raw selector list under 'gender', where it was
            # immediately overwritten.
            age = cross_talk.xpath(
                './/div[@class="author clearfix"]'
                '/div[contains(@class, "articleGender")]/text()'
            ).get()
            cross_talk_dict['age'] = age.strip() if age is not None else None

            # The gender is encoded in the CSS class, e.g.
            # "articleGender womenIcon" -> "women".  Strip after removing the
            # fixed parts so no leading space is left behind.
            gender = cross_talk.xpath(
                './/div[@class="author clearfix"]/div/@class'
            ).get()
            if gender is not None:
                gender = (
                    gender.replace('articleGender', '')
                    .replace('Icon', '')
                    .strip()
                )
            cross_talk_dict['gender'] = gender

            # The joke text may be split over several text nodes.
            content = ''.join(
                cross_talk.xpath('.//div[@class="content"]/span/text()').getall()
            ).strip()
            cross_talk_dict['content'] = content

            laugh = cross_talk.xpath(
                './div[@class="stats"]/span[@class="stats-vote"]'
                '/i[@class="number"]/text()'
            ).get()
            cross_talk_dict['laugh'] = laugh

            comments = cross_talk.xpath(
                './div[@class="stats"]//span[@class="stats-comments"]'
                '/a/i/text()'
            ).get()
            cross_talk_dict['comments'] = comments

            cross_talk_list.append(cross_talk_dict)

        print(cross_talk_list)
        return cross_talk_list
pipelines.py 爬虫数据存储文件
import json

from itemadapter import ItemAdapter
class QsbkPipeline:
    """Item pipeline that writes every scraped item to ``qsbk_scrapy.json``.

    The file holds one JSON object per line.  It is opened when the pipeline
    is constructed and closed in :meth:`close_spider`.
    """

    def __init__(self):
        print('==========QsbkPipeLine.__init__(self)==========')
        # Kept open for the whole crawl; released in close_spider().
        self.f = open('qsbk_scrapy.json', 'w', encoding='utf-8')

    def open_spider(self, spider):
        """Announce that the spider has been opened.

        :param spider: the spider that was opened (unused).
        """
        print('==========open_spider(self,spider)==========')

    def process_item(self, item, spider):
        """Serialize one item as a JSON line and append it to the output file.

        Called once for every object the spider hands over.

        :param item: the data object passed from the spider; any mapping-like
            item is accepted, not only a plain ``dict``.
        :param spider: the spider that scraped the item.
        :return: the unchanged ``item``, so later pipelines can process it.
        """
        # json.dumps encodes a Python object into a JSON string (json.loads is
        # the inverse).  dict() converts mapping-like items that json cannot
        # serialize directly; ensure_ascii=False keeps non-ASCII text readable.
        json_str = json.dumps(dict(item), ensure_ascii=False)
        self.f.write(json_str + '\n')
        print('==========QsbkPipeline.process_item(self,item={},spider={})=========='.format(item, spider))
        return item

    def close_spider(self, spider):
        """Close the output file and announce that the spider has closed.

        :param spider: the spider that was closed (unused).
        """
        self.f.close()
        print('==========close_spider(self,spider)==========')
settings.py 配置文件
# Scrapy settings for my_scrapy project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# Name of the crawler bot; defaults to the project name.
BOT_NAME ='my_scrapy'
# List of modules in which the spiders are located.
SPIDER_MODULES = ['my_scrapy.spiders']
# Module in which newly created spiders are placed.
NEWSPIDER_MODULE ='my_scrapy.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'my_scrapy (+http://www.yourdomain.com)'
# Obey robots.txt rules
# The generated project sets this to True (the crawler follows robots.txt); it is set to False here.
ROBOTSTXT_OBEY =False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# Maximum number of requests sent at the same time; the commented-out line would raise it from the default 16 to 32.
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# Delay, in seconds, applied while crawling.
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# Maximum number of concurrent requests allowed to a single domain (16 in the commented-out line).
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
# Maximum number of concurrent requests allowed to a single IP address.
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# Whether cookies are enabled; they are enabled by default.
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
# Whether to disable the telnet (remote) console.
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# Default headers for requests; the browser User-Agent disguises the crawler as a regular browser to reduce the chance of being blocked.
DEFAULT_REQUEST_HEADERS = {
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language':'en',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# Spider middlewares to enable (none are enabled here).
#SPIDER_MIDDLEWARES = {
# 'my_scrapy.middlewares.MyScrapySpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# Downloader middlewares to enable (none are enabled here).
#DOWNLOADER_MIDDLEWARES = {
# 'my_scrapy.middlewares.MyScrapyDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Item pipelines: each item scraped by a spider is passed on to the pipelines listed here.
# Each key is the full import path of a pipeline class.
# Each value is a priority weight: the pipeline with the smaller number runs first.
ITEM_PIPELINES = {
'my_scrapy.pipelines.QsbkPipeline':300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
start.py 运行爬虫程序的命令
from scrapy import cmdline

# Launch the "qsbk" spider from a script, so it can be started directly from
# an IDE (right-click > Run) instead of typing the command in a terminal.
#
# cmdline.execute() takes an argv-style list, so the command string is split
# on whitespace into ['scrapy', 'crawl', 'qsbk'].  Passing the whole command
# as a single element (['scrapy crawl qsbk']) is NOT equivalent: the list
# would then contain no separate command name or spider name.
if __name__ == '__main__':
    cmd = 'scrapy crawl qsbk'.split()
    cmdline.execute(cmd)
网友评论