The example code for this article is available at scrapy-tutorial.
Project
python --version
# Python 3.5.2
pip install scrapy
scrapy version
# Scrapy 1.5.1
scrapy startproject tutorial && cd tutorial
gi python >> .gitignore  # 'gi' is the gitignore.io shell helper; it prints a Python .gitignore template
touch README.md
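For reference, startproject generates Scrapy's standard project layout:

tutorial/
    scrapy.cfg            # deploy configuration
    tutorial/             # the project's Python module
        __init__.py
        items.py          # item definitions
        middlewares.py    # spider/downloader middlewares
        pipelines.py      # item pipelines
        settings.py       # project settings
        spiders/          # spider modules go here
            __init__.py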
Spider
scrapy genspider zhgcloud zhgcloud.com
vim tutorial/spiders/zhgcloud.py
# -*- coding: utf-8 -*-
import scrapy


class ZhgcloudSpider(scrapy.Spider):
    name = 'zhgcloud'                                  # unique name used by 'scrapy crawl'
    allowed_domains = ['zhgcloud.com']                 # requests outside these domains are filtered
    start_urls = ['https://www.zhgcloud.com/source/']  # initial request(s)

    def parse(self, response):
        # default callback for responses to the start_urls requests
        print(response.body)
- Crawl
scrapy crawl zhgcloud
Selector
Selector is Scrapy's mechanism for extracting data from responses. It supports XPath, CSS selectors, and regular expressions; the shell session below uses XPath, and a CSS/regex sketch follows it.
scrapy shell https://www.zhgcloud.com/source/
body = scrapy.Selector(text=response.text)  # Selector(text=...) expects str; response.body is bytes on Python 3
posts = body.xpath('//div[@class="el-card post"]')
# len(posts) = 10
post = posts[0]
# <Selector xpath='//div[@class="el-card post"]' data='<div class="el-card post" data-v-2fd17dd'>
# note: '//...' searches the whole document even from a sub-selector
# (use './/...' for queries relative to 'post'), so these return every post's fields
links = post.xpath('//a[@class="title"]/@href').extract()
print(links[0])
# /source/51
titles = post.xpath('//a[@class="title"]/text()').extract()
print(titles[0])
# 智慧公路将出现,未来将不需要加油站、收费站、驾驶员,一场新的大变革正在到来!
descriptions = post.xpath('//div[@class="description"]/text()').extract()
print(descriptions[0])
# 也许在不远的将来,100万名加油站工人都将失业;所有收费站的收费员都会下岗;所有司机都将被无人驾驶机器人所取代。
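The same fields can also be pulled with CSS selectors or regular expressions. A minimal sketch, continuing the same shell session (the '::attr(...)'/'::text' syntax and Selector.re() are standard Scrapy APIs):

# CSS selectors; '::attr(...)' and '::text' are Scrapy extensions to standard CSS
links = response.css('a.title::attr(href)').extract()
titles = response.css('a.title::text').extract()

# regular expressions via .re(); here, pulling the numeric id out of each href
post_ids = response.xpath('//a[@class="title"]/@href').re(r'/source/(\d+)')
# post_ids[0] == '51'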
XPath, the XML Path Language, is a language for locating parts of an XML document. For the full syntax, see the XPath documentation.
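A few common XPath constructs, shown on a small inline snippet (illustrative only):

from scrapy import Selector

sel = Selector(text='<div><a class="title" href="/source/51">Hi</a></div>')
sel.xpath('//a')                         # any <a> element, anywhere in the document
sel.xpath('//a[@class="title"]')         # filter by attribute value
sel.xpath('//a/@href').extract_first()   # select an attribute -> '/source/51'
sel.xpath('//a/text()').extract_first()  # select the element text -> 'Hi'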
Parse
vim tutorial/spiders/zhgcloud.py
# -*- coding: utf-8 -*-
import scrapy


class ZhgcloudSpider(scrapy.Spider):
    name = 'zhgcloud'
    allowed_domains = ['zhgcloud.com']
    start_urls = ['https://www.zhgcloud.com/source/']

    def parse(self, response):
        body = scrapy.Selector(text=response.text)
        posts = body.xpath('//div[@class="el-card post"]')
        post = posts[0]
        # as noted above, '//...' is document-wide, so each query below
        # returns the matching field of every post at once
        links = post.xpath('//a[@class="title"]/@href').extract()
        titles = post.xpath('//a[@class="title"]/text()').extract()
        descriptions = post.xpath(
            '//div[@class="description"]/text()').extract()
        for i in range(len(posts)):
            post_item = {}
            post_item['link'] = links[i]
            post_item['title'] = titles[i].strip()
            post_item['description'] = descriptions[i].strip()
            print(post_item['link'])
            print(post_item['title'])
            print(post_item['description'])
- Crawl
scrapy crawl zhgcloud
Item
vim tutorial/items.py
# unchanged code omitted
class PostItem(scrapy.Item):
    link = scrapy.Field()
    title = scrapy.Field()
    description = scrapy.Field()
vim tutorial/spiders/zhgcloud.py
# -*- coding: utf-8 -*-
import scrapy

from tutorial.items import PostItem


class ZhgcloudSpider(scrapy.Spider):
    # unchanged code omitted

    def parse(self, response):
        # unchanged code omitted
        for i in range(len(posts)):
            post_item = PostItem()
            post_item['link'] = links[i]
            post_item['title'] = titles[i].strip()
            post_item['description'] = descriptions[i].strip()
            yield post_item
- Crawl
scrapy crawl zhgcloud -o posts.json
echo "\nFEED_EXPORT_ENCODING = 'utf-8'" >> tutorial/settings.py
scrapy crawl zhgcloud -o posts.json
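posts.json should now contain readable UTF-8 output along these lines (values abridged from the shell session above):

[
    {"link": "/source/51", "title": "智慧公路将出现,...", "description": "也许在不远的将来,..."},
    ...
]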
An Item is a simple container that holds scraped data. It provides a dictionary-like API plus a concise syntax for declaring its available fields; see the Items documentation for details.
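A quick sketch of that dictionary-like API, using the PostItem declared above:

from tutorial.items import PostItem

item = PostItem(link='/source/51', title='demo')
item['description'] = 'demo description'  # dict-style assignment
print(item['title'])                      # dict-style access
print(item.keys())                        # fields that have been set
print(dict(item))                         # convert to a plain dict
# item['author'] = 'x'                    # raises KeyError: undeclared field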
Pipeline
docker run --name scrapy-mongo -p 27017:27017 -d mongo
pip install pymongo
vim tutorial/pipelines.py
# -*- coding: utf-8 -*-
import pymongo


class MongoPipeline(object):

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # build the pipeline from the crawler's settings
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB'))

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # one collection per item class, e.g. 'PostItem'
        name = item.__class__.__name__
        self.db[name].insert_one(dict(item))  # insert() is deprecated in PyMongo 3
        return item

    def close_spider(self, spider):
        self.client.close()
vim tutorial/settings.py
# unchanged code omitted
MONGO_URI = 'localhost:27017'
MONGO_DB = 'tutorial'

ITEM_PIPELINES = {
    'tutorial.pipelines.MongoPipeline': 300,
}
- Crawl
scrapy crawl zhgcloud
For more on PyMongo, see the PyMongo Tutorial.
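To check that the pipeline worked, the stored items can be read back with PyMongo. A minimal sketch, assuming the MONGO_URI/MONGO_DB settings above (the collection is named after the item class):

import pymongo

client = pymongo.MongoClient('localhost:27017')
db = client['tutorial']

print(db['PostItem'].count_documents({}))  # number of stored posts
for post in db['PostItem'].find().limit(3):
    print(post['link'], post['title'])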