Horizontal: keep following the "next page" link to collect the URL of every listing page.
Vertical: drill down from each listing page into the individual movie detail pages.
Spider code:
import scrapy
import urllib.parse
from douban_movie.items import DoubanMovieItem


class MovieSpider(scrapy.Spider):
    # spider name
    name = 'movie'
    # start URL
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        # horizontal crawl: extract the "next page" link
        next_page = response.xpath("//div/span[@class='next']/a/@href").extract_first()
        # keep crawling while a next page exists
        if next_page is not None:
            yield scrapy.Request(urllib.parse.urljoin(response.url, next_page))
        # vertical crawl: follow each movie's detail-page URL
        detail_url = response.xpath("//div[@class='item']/div/a/@href").extract()
        for url in detail_url:
            yield scrapy.Request(url, callback=self.parse_item)

    def parse_item(self, response):
        item = DoubanMovieItem()
        item['name'] = response.xpath("//div[@id='content']/h1/span[1]/text()").extract_first()
        item['stars'] = response.xpath("//div/strong[@class='ll rating_num']/text()").extract_first()
        item['comment'] = response.xpath("//a[@class='rating_people']/span/text()").extract_first()
        yield item
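The spider imports DoubanMovieItem from douban_movie.items, but the item class itself is not shown here. A minimal sketch of what it could look like, assuming it only needs the three fields populated in parse_item (name, stars, comment):

# items.py -- hypothetical sketch matching the fields used in parse_item
import scrapy

class DoubanMovieItem(scrapy.Item):
    name = scrapy.Field()     # movie title
    stars = scrapy.Field()    # rating score
    comment = scrapy.Field()  # number of people who rated

With the project laid out this way, the spider can be run and its output exported with, for example, scrapy crawl movie -o top250.json.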