The full project layout:
|-- my_scrapy_project
|   |-- __init__.py
|   |-- items.py
|   |-- mongodb.py
|   |-- pipelines.py
|   |-- settings.py
|   `-- spiders
|       |-- __init__.py
|       `-- test1.py
`-- scrapy.cfg
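items.py is listed above but not shown; a minimal sketch that matches the three fields the spider below fills in would be:

# coding: utf-8
import scrapy


class TestItem(scrapy.Item):
    # Fields populated by spiders/test1.py
    title = scrapy.Field()
    link = scrapy.Field()
    download = scrapy.Field()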
First, edit settings.py and add the MongoDB configuration:
# ITEM_PIPELINES maps pipeline paths to priorities
ITEM_PIPELINES = {
    'my_scrapy_project.mongodb.MongoDBPipeline': 300,
}
HOST = "127.0.0.1"
PORT = 27017
DB = "tt"
COLLECTION = "meiju"
Then put the MongoDB details in mongodb.py:
# coding: utf-8
import pymongo
from scrapy.exceptions import DropItem


class MongoDBPipeline(object):

    @classmethod
    def from_crawler(cls, crawler):
        # Pull the connection parameters defined in settings.py
        s = crawler.settings
        return cls(s["HOST"], s["PORT"], s["DB"], s["COLLECTION"])

    def __init__(self, host, port, db, collection):
        # Connect once when the pipeline is created
        client = pymongo.MongoClient(host, port)
        self.collection = client[db][collection]

    def process_item(self, item, spider):
        # Drop items with an empty field instead of writing partial records
        for field, value in item.items():
            if not value:
                raise DropItem("Missing {0}!".format(field))
        self.collection.insert_one(dict(item))
        spider.logger.debug("Question added to MongoDB database!")
        return item
The spider file test1.py now looks like this:
# coding: utf-8
import scrapy

from my_scrapy_project.items import TestItem


class TtSpider(scrapy.Spider):
    name = "tt"
    allowed_domains = ["ttmeiju.com"]
    start_urls = [
        "http://www.ttmeiju.com/",
    ]

    def parse(self, response):
        rows = response.xpath(
            "//table[contains(@class,'seedtable')]"
            "/tr[contains(@class,'Scontent')]")
        for sel in rows:
            item = TestItem()
            # extract_first() gives one string rather than a list
            title = sel.xpath('td[2]/a/text()').extract_first(default='')
            item['title'] = title.replace("\n", "")
            item['link'] = sel.xpath('td[2]/a/@href').extract_first()
            item['download'] = sel.xpath('td[3]/a/@href').extract_first()
            yield item
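If the XPath expressions ever need adjusting, scrapy shell lets you try them against the live page before running a full crawl, for example:

scrapy shell "http://www.ttmeiju.com/"
>>> response.xpath("//table[contains(@class,'seedtable')]"
...                "/tr[contains(@class,'Scontent')]/td[2]/a/text()").extract()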
Finally:
Run the usual command: scrapy crawl tt
and the scraped data is stored directly into MongoDB.
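To confirm the items actually landed, a quick check with pymongo (using the tt database and meiju collection configured above) might look like:

# coding: utf-8
import pymongo

client = pymongo.MongoClient("127.0.0.1", 27017)
# Print the first few stored documents
for doc in client["tt"]["meiju"].find().limit(5):
    print(doc)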