Install Python
Python 2.7 was originally recommended; Python 3.7 was tested at the time and did not work.
Update: the latest Scrapy release, v1.6, supports Python 3.4+.
Install Scrapy
pip install scrapy
On macOS the install may fail with an error that the six package cannot be installed; downloading the six source directly and installing it by hand works around this.
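Once the install finishes, a quick way to confirm both the Scrapy version and the Python interpreter it runs under is Scrapy's own version command:
scrapy version -v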
Create a new Scrapy project
scrapy startproject demo01
The Scrapy project layout
demo01                      => project root
├── demo01                  => main package
│   ├── spiders             => crawl-handling logic lives here
│   │   ├── __init__.py     => package marker
│   │   └── demo01Spider.py => spider file, created by hand or via the CLI (see the command below)
│   ├── items.py            => containers for the scraped data
│   ├── middlewares.py      => middlewares
│   ├── pipelines.py        => item pipelines (post-processing for scraped items)
│   └── settings.py         => project settings
└── scrapy.cfg              => global/deploy configuration
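The spider file in the tree above can be written by hand or generated with the built-in genspider command; the file and spider name it produces come from the first argument, so the names here are only examples:
cd demo01
scrapy genspider demo01Spider jianshu.com
The generated stub is then edited into the spider shown later in this post.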
Define the Item
import scrapy


class Demo01Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # article title
    title = scrapy.Field()
    # article link
    link = scrapy.Field()
    # author
    author = scrapy.Field()
    # summary / notes
    memo = scrapy.Field()
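An Item behaves like a dict with a fixed set of keys: fields are read and written with subscript syntax, and assigning a field that was never declared raises a KeyError. A quick illustration (standalone, outside the crawl):
from demo01.items import Demo01Item

item = Demo01Item()
item['title'] = ['A sample title']
print(item['title'])   # ['A sample title']
# item['foo'] = 1      # would raise KeyError: undeclared field 'foo'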
The spider: main crawl logic
from scrapy.spiders import CrawlSpider
from scrapy.selector import Selector
from demo01.items import Demo01Item


# inherit from CrawlSpider
class Demo01Spider(CrawlSpider):
    # spider name, used by `scrapy crawl`
    name = "demo01"
    # initial URL to fetch
    start_urls = ['https://www.jianshu.com/trending/monthly']
    # site root, handy for joining the relative links extracted below
    url = 'https://www.jianshu.com'

    # override the parent class's parse method (no crawl rules are
    # defined, so overriding parse is safe on this CrawlSpider)
    def parse(self, response):
        selector = Selector(response)
        articles = selector.xpath('//ul[@class="note-list"]/li')
        for article in articles:
            # build a fresh item for every article
            item = Demo01Item()
            title = article.xpath('div[@class="content"]/a/text()').extract()
            print(title)  # debug output
            link = article.xpath('a[@class="wrap-img"]/@href').extract()
            author = article.xpath(
                'div[@class="content"]/div/a[@class="nickname"]/text()').extract()
            memo = article.xpath('div[@class="content"]/p/text()').extract()
            item['title'] = title
            item['link'] = link
            item['author'] = author
            item['memo'] = memo
            # hand the populated item to the exporters/pipelines
            yield item
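A note on .extract(): it always returns a list of strings (possibly empty), which is why the CSV columns produced below can hold multiple values. The behaviour is easy to try offline against a snippet of HTML shaped like the page; the markup here is only an assumption about jianshu's structure:
from scrapy.selector import Selector

html = ('<ul class="note-list"><li><div class="content">'
        '<a>Sample title</a></div></li></ul>')
sel = Selector(text=html)
print(sel.xpath('//ul[@class="note-list"]/li/div[@class="content"]/a/text()').extract())
# ['Sample title']
print(sel.xpath('//li/a[@class="wrap-img"]/@href').extract())
# [] -- a non-matching XPath yields an empty list, not an error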
Set the export location in the settings file
# feed export: where to write the file and in what format
FEED_URI = '/Users/gsp/Documents/jianshu-hot.csv'
FEED_FORMAT = 'csv'
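These two keys work on the Scrapy versions discussed here; on Scrapy 2.1+ they are deprecated in favour of the single FEEDS dictionary, which for the same path would look like this:
FEEDS = {
    '/Users/gsp/Documents/jianshu-hot.csv': {'format': 'csv'},
}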
Run the spider
scrapy crawl demo01
Error handling
At this point the run fails with a message along the lines of:
HTTP status code is not handled or not allowed
The fix is as follows:
# add this to the settings file
HTTPERROR_ALLOWED_CODES = [404, 403]
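Note that whitelisting 403/404 means those responses now reach the parse callback instead of being dropped, so it is worth guarding against them there. A minimal sketch inside the spider:
    def parse(self, response):
        # skip pages the server refused or could not find
        if response.status in (403, 404):
            self.logger.warning('got %s for %s', response.status, response.url)
            return
        # ... normal extraction continues here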
Run the command again:
scrapy crawl demo01
It still fails; the log shows the requests being rejected by the server. This is the target site's anti-scraping policy at work, and the way around it is to attach a random User-Agent header to each request:
1. Declare the User-Agent pool in the settings file
# pool of User-Agent strings to pick from at random
USER_AGENT_LIST=[
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
2. Enable the middleware in the settings file
# register the custom middleware and disable the built-in one
DOWNLOADER_MIDDLEWARES = {
    'demo01.middlewares.Demo01SpiderMiddleware': 403,
    # disable the built-in User-Agent middleware so ours takes over
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
3. Edit the middlewares file to set the header
import random

from demo01.settings import USER_AGENT_LIST


class Demo01SpiderMiddleware(object):
    # ... (other generated template methods omitted)

    def process_request(self, request, spider):
        # attach a randomly chosen User-Agent to every outgoing request
        ua = random.choice(USER_AGENT_LIST)
        if ua:
            request.headers.setdefault('User-Agent', ua)
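To verify that the random header is really being sent, the request attached to each response can be inspected from any spider callback; a throwaway check, not part of the tutorial's code:
    def parse(self, response):
        # the User-Agent the downloader actually used for this response
        self.logger.info(response.request.headers.get('User-Agent'))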
Run it once more:
scrapy crawl demo01
![](https://img.haomeiwen.com/i8953921/57aaa49610417512.png)