The source code notes are as follows:
# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy.http import Request
from urllib import parse


class JobboleSpider(scrapy.Spider):
    name = "jobbole"
    allowed_domains = ["blog.jobbole.com"]
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        """
        1. Extract the article URLs from the post list and hand them to Scrapy for parsing.
        2. Recurse into the next page.
        :param response:
        :return:
        """
        # Extract the article URLs from the post list and hand them to Scrapy for parsing
        post_urls = response.css("#archive .floated-thumb .post-thumb a::attr(href)").extract()
        for post_url in post_urls:
            yield Request(url=parse.urljoin(response.url, post_url), callback=self.parse_detail, dont_filter=True)
        # Extract the next-page link and recurse
        next_url = response.css(".next.page-numbers::attr(href)").extract_first("")
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse, dont_filter=True)

    def parse_detail(self, response):
        # Extract the fields with CSS selectors
        title = response.css(".entry-header h1::text").extract()[0]
        create_date = response.css("p.entry-meta-hide-on-mobile::text").extract()[0].strip().replace("·", "").strip()
        praise_nums = response.css(".vote-post-up h10::text").extract()[0]
        # Favourite and comment counts come back as text such as "2 收藏",
        # so pull out the leading run of digits and default to 0 if none is found
        fav_nums = response.css(".bookmark-btn::text").extract()[0]
        match_re = re.match(r".*?(\d+).*", fav_nums)
        if match_re:
            fav_nums = int(match_re.group(1))
        else:
            fav_nums = 0
        comment_nums = response.css("a[href='#article-comment'] span::text").extract()[0]
        match_re = re.match(r".*?(\d+).*", comment_nums)
        if match_re:
            comment_nums = int(match_re.group(1))
        else:
            comment_nums = 0
        content = response.css("div.entry").extract()[0]
        tag_list = response.css("p.entry-meta-hide-on-mobile a::text").extract()
        tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
        tags = ",".join(tag_list)
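The digit-extraction regex in parse_detail only keeps the first run of digits in strings like "2 收藏" or "1 评论" and falls back to 0 when no digit is present. A minimal standalone sketch of that behaviour (the sample strings below are assumed, not scraped from the site) is:

import re

# Hypothetical samples of the text returned by the .bookmark-btn and
# comment-count selectors; the real pages may differ.
for raw in [" 2 收藏", "1 评论", "收藏"]:
    match_re = re.match(r".*?(\d+).*", raw)
    nums = int(match_re.group(1)) if match_re else 0
    print(repr(raw), "->", nums)

Once this file sits under the project's spiders directory, the spider can be run from the project root with scrapy crawl jobbole.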