Download the scrapy-redis project from GitHub (https://github.com/rmax/scrapy-redis) and unzip it locally. Then use the author's example-project as a template: copy the spider file from your previously written Scrapy project into the spiders folder of that project.
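For the crawl to actually go through Redis, example-project's settings.py also has to enable the scrapy-redis components. A minimal sketch of the relevant settings, assuming Redis runs locally on the default port (see the scrapy-redis README for the full list):

# settings.py -- a minimal sketch, assuming a local Redis on the default port

# Use scrapy-redis's scheduler and duplicate filter so requests are shared via Redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# Keep the request queue between runs, so the crawl can be paused and resumed
SCHEDULER_PERSIST = True

# Push scraped items into Redis in addition to any other pipelines
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

# Where the shared Redis instance lives
REDIS_URL = 'redis://localhost:6379'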
Before modification, csdn.py looked like this:
# -*- coding: utf-8 -*-
import scrapy

import scdn.items


class CsdnSpider(scrapy.Spider):
    name = 'csdn'
    allowed_domains = ['edu.csdn.net']
    start_urls = ['https://edu.csdn.net/lecturer?&page=1']

    def parse(self, response):
        for pagedata in response.xpath("//dl[@class='lector_list']"):
            item = scdn.items.ScdnItem()
            item['teacher'] = pagedata.xpath("./dd[1]/ul/li/a/text()").extract()
            item['lessons'] = pagedata.xpath("./dd[1]/ul/li[2]/span/text()").extract()
            item['student'] = pagedata.xpath("./dd[1]/ul/li[3]/span/text()").extract()
            item['describe'] = pagedata.xpath("./dd[1]/p/text()").extract()
            yield item
        for i in range(1, 57):
            url = "https://edu.csdn.net/lecturer?&page=" + str(i)
            yield scrapy.Request(url, callback=self.parse)
Only minor changes are needed. After modification, csdn.py looks like this:
# -*- coding: utf-8 -*-
import scrapy

import example.items


class CsdnSpider(scrapy.Spider):
    name = 'csdn'
    allowed_domains = ['edu.csdn.net']
    start_urls = ['https://edu.csdn.net/lecturer?&page=1']

    def parse(self, response):
        for pagedata in response.xpath("//dl[@class='lector_list']"):
            item = example.items.CsdnItem()
            item['teacher'] = pagedata.xpath("./dd[1]/ul/li/a/text()").extract()
            item['lessons'] = pagedata.xpath("./dd[1]/ul/li[2]/span/text()").extract()
            item['student'] = pagedata.xpath("./dd[1]/ul/li[3]/span/text()").extract()
            item['describe'] = pagedata.xpath("./dd[1]/p/text()").extract()
            yield item
        for i in range(1, 57):
            url = "https://edu.csdn.net/lecturer?&page=" + str(i)
            yield scrapy.Request(url, callback=self.parse)
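Note that this spider still subclasses scrapy.Spider, so it schedules its own start_urls. To let several crawler processes pull from one shared queue, scrapy-redis also provides RedisSpider, which reads its start URLs from a Redis list instead. A hedged sketch of that variant (the spider name csdn_redis and the key csdn:start_urls are my own choices, not from the original post):

from scrapy_redis.spiders import RedisSpider

import example.items


class CsdnRedisSpider(RedisSpider):
    name = 'csdn_redis'  # hypothetical name, to avoid clashing with the spider above
    allowed_domains = ['edu.csdn.net']
    # Instead of start_urls, the spider blocks on this Redis list and pops URLs from it;
    # feed it from redis-cli with: LPUSH csdn:start_urls https://edu.csdn.net/lecturer?&page=1
    redis_key = 'csdn:start_urls'

    def parse(self, response):
        # Same extraction logic as the scrapy.Spider version above
        for pagedata in response.xpath("//dl[@class='lector_list']"):
            item = example.items.CsdnItem()
            item['teacher'] = pagedata.xpath("./dd[1]/ul/li/a/text()").extract()
            item['lessons'] = pagedata.xpath("./dd[1]/ul/li[2]/span/text()").extract()
            item['student'] = pagedata.xpath("./dd[1]/ul/li[3]/span/text()").extract()
            item['describe'] = pagedata.xpath("./dd[1]/p/text()").extract()
            yield item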
The items.py borrowed from the example project, before modification:
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html
from scrapy.item import Item, Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join


class ExampleItem(Item):
    name = Field()
    description = Field()
    link = Field()
    crawled = Field()
    spider = Field()
    url = Field()


class ExampleLoader(ItemLoader):
    default_item_class = ExampleItem
    default_input_processor = MapCompose(lambda s: s.strip())
    default_output_processor = TakeFirst()
    description_out = Join()
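The ExampleLoader at the bottom is what wires the processors together: MapCompose(lambda s: s.strip()) strips whitespace from every input value, TakeFirst() collapses each field to its first non-empty value on output, and description_out = Join() overrides that for description so multiple fragments are concatenated. A minimal usage sketch, my own illustration rather than code from the example project:

# Inside a parse method; `response` is the usual Scrapy response object
loader = ExampleLoader(response=response)
loader.add_xpath('name', "//h1/text()")        # stripped by MapCompose, reduced by TakeFirst
loader.add_xpath('description', "//p/text()")  # multiple matches concatenated by Join()
loader.add_value('url', response.url)
yield loader.load_item()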
After modification, items.py looks like this:
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html
from scrapy.item import Item, Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join


class CsdnItem(Item):
    teacher = Field()
    lessons = Field()
    student = Field()
    describe = Field()
    crawled = Field()  # when the item was crawled
    spider = Field()   # which spider crawled it


class ExampleItem(Item):
    name = Field()
    description = Field()
    link = Field()
    crawled = Field()
    spider = Field()
    url = Field()


class ExampleLoader(ItemLoader):
    default_item_class = ExampleItem
    default_input_processor = MapCompose(lambda s: s.strip())
    default_output_processor = TakeFirst()
    description_out = Join()
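The crawled and spider fields are not filled in by the spider itself: in example-project they are stamped onto every item by a small pipeline. Roughly, that pipeline looks like the sketch below (from memory; check the project's pipelines.py for the exact code):

from datetime import datetime


class ExamplePipeline(object):
    def process_item(self, item, spider):
        # Record when the item was scraped and which spider produced it
        item["crawled"] = datetime.utcnow()
        item["spider"] = spider.name
        return item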
Once the files are modified, the project is ready to run. In cmd.exe, change into the project directory and run "scrapy crawl csdn"; you should see the target data being scraped and streaming past in the console.
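If scrapy_redis.pipelines.RedisPipeline is enabled in settings.py, the items are not only printed to the console; they are also serialized to JSON and pushed onto a Redis list, named "<spider>:items" by default, so csdn:items here. A quick sketch for inspecting them, assuming the redis-py package is installed and Redis runs locally:

import json

import redis

r = redis.StrictRedis(host='localhost', port=6379)
# RedisPipeline pushes JSON-serialized items onto the "<spider>:items" list
while True:
    data = r.lpop('csdn:items')
    if data is None:
        break  # list exhausted
    print(json.loads(data))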