昨天凌晨2点醒来后,看了下向右奔跑的文章,准备用 Scrapy 做一次跨页面的数据爬取,以简书“七日热门”数据为例。
1 items.py代码
from scrapy.item import Item,Field
class SevendayItem(Item):
    """Container for one JianShu seven-day-trending article record."""
    article_url = Field()  # article link, collected on the listing page
    author = Field()       # author name
    article = Field()      # article title
    date = Field()         # publish date
    word = Field()         # word count
    view = Field()         # view count
    comment = Field()      # comment count
    like = Field()         # like count
    gain = Field()         # reward (tip) count
可以看出,我要爬取的数据不在一个页面,这时候就需要跨页面爬取了。
2 新建sevendayspider.py
import json
import re
import sys
from urllib.parse import urljoin

sys.path.append("..")  # make the parent package importable when run directly

import requests
import scrapy
from scrapy.http import Request
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider

from sevenday.items import SevendayItem
class sevenday(CrawlSpider):
    """Crawl JianShu's seven-day trending list, follow each article link,
    and emit a SevendayItem per article (author, title, date, word count,
    views, comments, likes, reward count)."""
    name = 'sevenday'
    start_urls = ['http://www.jianshu.com/trending/weekly']

    def parse(self, response):
        """Parse one trending-list page: request every article detail page
        and queue the remaining paginated list pages."""
        selector = Selector(response)
        infos = selector.xpath('//ul[@class="note-list"]/li')
        for info in infos:
            article_url_part = info.xpath('div/a/@href').extract()[0]
            # urljoin avoids the double slash that naive concatenation
            # ('http://www.jianshu.com/' + '/p/...') produced when the
            # href already starts with '/'.
            article_url = urljoin(response.url, article_url_part)
            yield Request(article_url, meta={'article_url': article_url},
                          callback=self.parse_item)
        # Scrapy's duplicate filter drops already-seen pages, so yielding
        # these from every parse() call is redundant but harmless.
        urls = ['http://www.jianshu.com/trending/weekly?page={}'.format(i)
                for i in range(1, 11)]
        for url in urls:
            yield Request(url, callback=self.parse)

    def parse_item(self, response):
        """Parse one article page and yield a populated SevendayItem."""
        item = SevendayItem()
        item['article_url'] = response.meta['article_url']
        selector = Selector(response)
        # Decode the body once instead of once per regex.
        body = response.body.decode('utf-8')
        item['author'] = selector.xpath('//span[@class="name"]/a/text()').extract()[0]
        item['article'] = selector.xpath('//h1[@class="title"]/text()').extract()[0]
        item['date'] = selector.xpath('//span[@class="publish-time"]/text()').extract()[0]
        item['word'] = selector.xpath('//span[@class="wordage"]/text()').extract()[0]
        # Counters are embedded in inline JSON rather than in the markup.
        item['view'] = re.findall(r'"views_count":(.*?),', body, re.S)[0]
        item['comment'] = re.findall(r'"comments_count":(.*?)}', body, re.S)[0]
        item['like'] = re.findall(r'"likes_count":(.*?),', body, re.S)[0]
        # 'note_id' instead of 'id' to avoid shadowing the builtin.
        note_id = re.findall(r'{"id":(.*?),', body, re.S)[0]
        gain_url = 'http://www.jianshu.com/notes/{}/rewards?count=20'.format(note_id)
        # NOTE(review): requests.get blocks Scrapy's event loop; chaining a
        # scrapy Request with the partial item in meta would be preferable.
        wb_data = requests.get(gain_url)
        json_data = json.loads(wb_data.text)
        item['gain'] = json_data['rewards_count']
        yield item
对照原文章和我的代码就能看懂,这里就不再赘述了。
网友评论