# -*- coding: utf-8 -*-
# Make sure the text editor's encoding is set correctly, preferably UTF-8 without BOM.
import scrapy


class Num1Spider(scrapy.Spider):
    """Collect crawler-related article titles from a paginated Jianshu listing.

    The spider requests pages ``1..max_page`` of ``base_url`` and, for every
    listed article whose title contains "爬虫" or "爬取", yields a dict with
    the keys ``title``, ``like`` and ``url``.
    """

    name = "num1"  # spider name, used within a project
    allowed_domains = ["jianshu.com"]  # domains the spider is allowed to crawl
    domain = 'http://jianshu.com'  # custom base-domain variable, prefixed to article hrefs
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36",
    }  # request headers
    # For details on this URL, see "Crawlers for Beginners (1)".
    base_url = 'http://www.jianshu.com/collections/16/notes?order_by=added_at&page=%d'
    num = 0  # page number of the most recently requested page
    # Author's note: the "Programmer" collection has fewer than 4000 pages in
    # total; the full run took 212.975027 seconds.
    max_page = 4000

    def start_requests(self):
        """Yield one request per listing page (default Scrapy entry point).

        ``self.num`` is advanced before each request, so pages
        ``num + 1 .. max_page`` are requested and ``self.num`` equals
        ``max_page`` once the generator is exhausted.
        """
        while self.num < self.max_page:
            self.num += 1
            yield scrapy.Request(
                self.base_url % self.num,
                headers=self.headers,
                callback=self.parse,
            )

    def parse(self, response):
        """Yield an item for every crawler-related title on a listing page.

        Default callback, invoked once a page has been downloaded.

        Yields:
            dict: ``{"title": ..., "like": ..., "url": ...}`` where ``like`` is
            the text of the matching ``a + span`` node with the
            ' · 喜欢 ' label removed and ``url`` is ``self.domain`` followed
            by the title link's ``href``.
        """
        titles = response.css(".title a::text").extract()
        wanted = [
            (index, title)
            for index, title in enumerate(titles)
            if "爬虫" in title or "爬取" in title
        ]
        if not wanted:
            # Nothing of interest on this page; skip the remaining selectors.
            return

        # Evaluate the auxiliary selectors once per response instead of once
        # per matching title. They are indexed in parallel with the titles.
        likes = response.css("a + span::text").extract()
        hrefs = response.css('.title a::attr(href)').extract()
        for index, title in wanted:
            like = likes[index].replace(' · 喜欢 ', '')
            url = self.domain + hrefs[index]
            yield {"title": title, "like": like, "url": url}


######################## Debug ###############################
# from scrapy.shell import inspect_response
# inspect_response(response, self)
# Insert the two lines above anywhere inside the callback to pause the run
# and open an interactive shell for inspecting the response while debugging.
######################## Run ###############################
# scrapy runspider num1.py -o 1.json

作者：treelake

链接：http://www.jianshu.com/p/dcd6438ce4c7

来源：简书

著作权归作者所有。商业转载请联系作者获得授权，非商业转载请注明出处。

资料
Python爬虫系列（一）初期学习爬虫的拾遗与总结（11.4更） Python爬虫学习系列教程 Python爬虫学习手册
Python爬虫学习手册
爬虫文章 in 简书程序员专题： like:128-Python 爬取落网音乐 like:127-【图文详解】py...
Python爬虫学习（十六）初窥Scrapy
Python爬虫学习（一）概述Python爬虫学习（二）urllib基础使用Python爬虫学习（三）urllib...
Python学习-Scrapy爬虫专题
手册目的专门记录使用Scrapy爬虫学习过程中的各种坑IDE Anaconda，python 3.6 Scrap...
Python爬虫学习系列教程
转自: 静觅»Python爬虫学习系列教程 Python爬虫学习系列教程 Python版本：2.7 一、爬虫入门 ...
爬虫入门
为什么要学习爬虫？ Python做爬虫优势关于Python网络爬虫，我们需要学习的有：什么是爬虫？网络爬虫（...
Python爬虫学习之小结（一）
到目前为止，Python爬虫学习已经写了八篇文章，分别是： Python爬虫学习（一）概述Python爬虫学习（二...
python爬虫学习-day7-实战
目录 python爬虫学习-day1 python爬虫学习-day2正则表达式 python爬虫学习-day3-B...
Python 基础爬虫目录
目录 python爬虫学习-day1 python爬虫学习-day2正则表达式 python爬虫学习-day3-B...
python爬虫学习-day5-selenium
目录 python爬虫学习-day1 python爬虫学习-day2正则表达式 python爬虫学习-day3-B...