Reference: Web Scraping With Scrapy and MongoDB
0x00
This post uses the Scrapy crawler framework to scrape the newest questions on StackOverflow, together with each question's URL, and stores the results in MongoDB.
0x01 Define the Item
# -*- coding: utf-8 -*-
import scrapy


class QuestionItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    url = scrapy.Field()
First, define the Item. We only need to store two pieces of information, so declaring the two fields above is enough. An Item instance can be used much like a Python dict, and scrapy.Field() simply declares one of its keys.
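As a quick sketch (the values below are made up for illustration), an Item can be read, written, and converted just like a dict:

item = QuestionItem()
item['title'] = 'How do I parse JSON in Python?'        # made-up example value
item['url'] = '/questions/example/how-do-i-parse-json'  # made-up example value
print(item['title'])
print(dict(item))  # {'title': '...', 'url': '...'}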
0x02 Write the Spider
#!/usr/bin/env python3
# coding=utf-8
import scrapy
from stackOverflow.items import QuestionItem


class QuestionSpider(scrapy.Spider):
    # spider name
    name = "question"
    # only follow links within allowed_domains
    allowed_domains = ["stackoverflow.com"]
    start_urls = [
        "http://stackoverflow.com/questions?page=1&sort=newest"
    ]

    def parse(self, response):
        for question in response.xpath('//div[@class="summary"]/h3'):
            item = QuestionItem()
            item['title'] = question.xpath('a[@class="question-hyperlink"]/text()').extract_first()
            item['url'] = question.xpath('a[@class="question-hyperlink"]/@href').extract_first()
            yield item
        for i in range(1, 11):
            next_page = "http://stackoverflow.com/questions?page=%s&sort=newest" % str(i)
            yield scrapy.Request(next_page, callback=self.parse)
Because my computer has far too little disk space, I only crawl the first 10 pages to get the idea. Note that parse() yields the same ten page URLs on every call, but Scrapy's default duplicate filter ensures each page is only requested once. An alternative pagination approach is sketched below.
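As an aside, instead of hard-coding the page numbers, the spider could follow the pager's "next" link. The a[@rel="next"] selector below is an assumption about StackOverflow's markup at the time and would need to be verified:

    def parse(self, response):
        # ... build and yield QuestionItem objects exactly as above ...
        # then follow the "next" link instead of generating page URLs by hand
        next_href = response.xpath('//a[@rel="next"]/@href').extract_first()
        if next_href is not None:
            yield scrapy.Request(response.urljoin(next_href), callback=self.parse)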
0x03 Store the Data in MongoDB
First, define the settings we need in settings.py. The integer values in ITEM_PIPELINES range from 0 to 1000 and determine the order in which pipelines are called: the smaller the number, the higher the priority.
ITEM_PIPELINES = {
    'stackOverflow.pipelines.MongoDBPipeline': 300,
}
MONGO_URI = 'mongodb://localhost/'
MONGO_DATABASE = 'stackoverflow'
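For illustration only: if a second pipeline were registered (ValidationPipeline here is a hypothetical name), the one with the smaller number would be called first:

ITEM_PIPELINES = {
    'stackOverflow.pipelines.ValidationPipeline': 100,  # hypothetical pipeline, would run first
    'stackOverflow.pipelines.MongoDBPipeline': 300,     # runs afterwards
}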
Then connect to the database and store the data in the pipeline:
# -*- coding: utf-8 -*-
import pymongo
from scrapy.exceptions import DropItem


# class StackoverflowPipeline(object):
#     def process_item(self, item, spider):
#         return item


class MongoDBPipeline(object):
    # name of the MongoDB collection
    collection_name = 'questions'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # read the connection settings defined in settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # drop the item if any of its fields is missing or empty
        for data in item:
            if not item.get(data):
                raise DropItem("Missing {0}!".format(data))
        self.db[self.collection_name].insert_one(dict(item))
        spider.logger.debug("Question added to MongoDB database!")
        return item
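As a standalone sketch of the validation logic above (using a plain dict instead of a real Item), an item with any empty field would be reported and dropped:

item = {'title': 'Some question', 'url': ''}  # made-up item with an empty url
for field in item:
    if not item.get(field):
        print("Missing {0}! This item would be dropped.".format(field))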
0x04 Run MongoDB and the Spider
- In one terminal, run mongod to start MongoDB.
- In another terminal, run the spider with scrapy crawl question, where question is the name of our spider.
Here we can see that the 10 pages yielded 549 questions in total.
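To double-check from Python, a short pymongo snippet like the following can be run after the crawl finishes (it assumes the MONGO_URI and database/collection names configured above, and a pymongo version recent enough to have count_documents):

import pymongo

client = pymongo.MongoClient('mongodb://localhost/')
db = client['stackoverflow']
print(db['questions'].count_documents({}))  # total number of stored questions
for doc in db['questions'].find().limit(3):
    print(doc['title'], doc['url'])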