Scraping Bole Online (jobbole.com) articles is relatively easy, because the site has no real anti-scraping measures. For a full-site crawl we could choose between breadth-first and depth-first traversal; Scrapy's scheduler uses a last-in-first-out queue, so its default order is essentially depth-first. But since Bole Online provides a page listing all of its articles, we can simply start the crawl from that page.
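As an aside, if you ever do want breadth-first order, Scrapy can be switched to FIFO queues in settings.py; a minimal sketch using the scheduler settings documented in the Scrapy FAQ:

# process shallower requests first, and use FIFO queues
DEPTH_PRIORITY = 1
SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'

We will not need this here, because the all-posts listing is flat.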
Python version: Python 3.5
IDE: PyCharm
First create a virtual environment, activate it, and install Scrapy inside it with pip:

pip install scrapy

This one command is usually all we need: Scrapy has several dependency libraries, but pip downloads them automatically. On Windows you also need pywin32:

pip install pywin32

Then create the project:

scrapy startproject blspider

cd into the project directory and generate the spider (genspider takes a spider name and a domain):

cd blspider
scrapy genspider jobbole blog.jobbole.com

Finally, open the project in PyCharm.
Before writing any code, point PyCharm at our environment: in Settings, set the project interpreter to the virtual environment we just created. Then we can start writing the spider.
For each article we will scrape: the cover image from the list page, plus the title, publish date, tags, body content, up-vote count, and comment count.
items.py
import re
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose, Join
from datetime import datetime

# Drop tag values that are really the comment count (e.g. "1 评论")
def tags_handle(value):
    if "评论" in value:
        return ""
    return value

# Extract the number from strings like "2 收藏"; the lazy .*? makes the
# group capture the whole number instead of just its last digit
def num_handle(value):
    g = re.match(r".*?(\d+).*", value)
    if g:
        value = int(g.group(1))
    else:
        value = 0
    return value

# Parse dates like "2018/03/08 ·"; fall back to today on failure
def date_handle(value):
    try:
        value = value.strip().replace("·", "").strip()
        date = datetime.strptime(value, "%Y/%m/%d").date()
    except Exception:
        date = datetime.now().date()
    return date

# Pass the value through unchanged
def return_value(value):
    return value

# Custom ItemLoader: by default every field keeps only its first value
class MyItemLoader(ItemLoader):
    default_output_processor = TakeFirst()

class BlspiderItem(scrapy.Item):
    title = scrapy.Field()
    date = scrapy.Field(
        input_processor=MapCompose(date_handle)
    )
    post_url = scrapy.Field()
    image_url = scrapy.Field(
        # override the default TakeFirst: ImagesPipeline expects a list of URLs
        output_processor=MapCompose(return_value)
    )
    image_path = scrapy.Field()
    tags = scrapy.Field(
        input_processor=MapCompose(tags_handle),
        output_processor=Join(",")
    )
    content = scrapy.Field()
    favour = scrapy.Field(
        input_processor=MapCompose(num_handle)
    )
    comment = scrapy.Field(
        input_processor=MapCompose(num_handle)
    )

    # Build the INSERT statement and its parameters for the pipeline
    def getinsertsql(self):
        sql = """
            insert ignore into blsposts values (%s,%s,%s,%s,%s,%s,%s,%s,%s)
        """
        # image_url is stored as a list (see above); persist its first element
        image_url = self.get("image_url")
        params = (self["title"], self["date"], self["post_url"],
                  image_url[0] if image_url else None,
                  self.get("image_path"), self["tags"],
                  self["content"], self["favour"], self["comment"])
        return sql, params
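Before moving on, here is a quick standalone illustration (not part of the project) of what these processors do with a list of extracted values:

from scrapy.loader.processors import MapCompose, TakeFirst, Join

# MapCompose applies each function in turn to every value in the list
print(MapCompose(str.strip, str.upper)([" python ", " scrapy "]))  # ['PYTHON', 'SCRAPY']
# TakeFirst returns the first non-empty value, which is why most fields end up scalar
print(TakeFirst()(["", "first", "second"]))  # 'first'
# Join concatenates all values with the separator, as used for tags
print(Join(",")(["a", "b", "c"]))  # 'a,b,c'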
jobbole.py
import scrapy
from scrapy.http import Request
from urllib.parse import urljoin
from blspider.items import MyItemLoader, BlspiderItem

class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        # each node on the list page holds the post link and its cover image
        post_nodes = response.css("#archive .post-thumb a")
        for node in post_nodes:
            post_url = node.css("::attr(href)").extract_first()
            image_url = node.css("img::attr(src)").extract_first()
            # pass the cover image URL to the detail page via meta
            yield Request(url=urljoin(response.url, post_url),
                          meta={"image_url": urljoin(response.url, image_url)},
                          callback=self.detail_parse)
        # follow the "next page" link until the listing is exhausted
        next_url = response.css("div.navigation a.next::attr(href)").extract_first()
        if next_url:
            yield Request(url=urljoin(response.url, next_url), callback=self.parse)

    def detail_parse(self, response):
        itemloader = MyItemLoader(item=BlspiderItem(), response=response)
        itemloader.add_css("title", ".entry-header h1::text")
        itemloader.add_css("date", ".entry-meta-hide-on-mobile::text")
        itemloader.add_value("post_url", response.url)
        itemloader.add_value("image_url", response.meta.get("image_url"))
        itemloader.add_css("tags", ".entry-meta-hide-on-mobile a::text")
        itemloader.add_css("content", ".entry")
        itemloader.add_css("favour", ".vote-post-up h10::text")
        itemloader.add_css("comment", ".post-adds a span::text")
        item = itemloader.load_item()
        # posts with zero votes have no text node, so the field may be missing
        if not item.get("favour"):
            item["favour"] = 0
        yield item
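If any of these CSS selectors stops matching (page markup does change over time), the quickest way to debug is scrapy shell, which loads a page and lets you evaluate expressions against the live response:

scrapy shell http://blog.jobbole.com/all-posts/
# then, inside the shell:
response.css("#archive .post-thumb a::attr(href)").extract_first()
response.css("div.navigation a.next::attr(href)").extract_first()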
pipelines.py
from twisted.enterprise import adbapi
import pymysql

class BlspiderPipeline(object):
    def __init__(self, pool):
        self.dbpool = pool

    @classmethod
    def from_settings(cls, settings):
        # build an asynchronous connection pool from the MySQL settings
        dbparams = dict(
            host=settings["MYSQL_HOST"],
            database=settings["MYSQL_DB"],
            user=settings["MYSQL_USER"],
            password=settings["MYSQL_PW"],
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True
        )
        pool = adbapi.ConnectionPool("pymysql", **dbparams)
        return cls(pool)

    def process_item(self, item, spider):
        # ImagesPipeline (priority 1) runs first and fills image_path with
        # a list of result dicts; keep just the stored file path
        if item.get("image_path"):
            item["image_path"] = item["image_path"][0]["path"]
        else:
            item["image_path"] = None
        # run the insert asynchronously on the Twisted connection pool
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.error_handle, item)
        return item

    def error_handle(self, error, item):
        print(error)

    def do_insert(self, cursor, item):
        sql, params = item.getinsertsql()
        cursor.execute(sql, params)
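The pipeline assumes a blsposts table whose nine columns match the parameter order in getinsertsql(). The column names and types below are my own assumptions, so adjust them to taste; a minimal one-off creation script with pymysql:

import pymysql

conn = pymysql.connect(host="localhost", user="...", password="...",
                       database="...", charset="utf8")
with conn.cursor() as cursor:
    # column order matches the params tuple in getinsertsql()
    cursor.execute("""
        create table if not exists blsposts (
            title varchar(255) not null,
            date date,
            post_url varchar(255) not null,
            image_url varchar(512),
            image_path varchar(512),
            tags varchar(255),
            content longtext,
            favour int,
            comment int,
            primary key (post_url)
        ) default charset=utf8
    """)
conn.commit()
conn.close()

The primary key on post_url is what lets the insert ignore in getinsertsql() skip articles that were already stored.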
settings.py
ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    # lower number = higher priority: download images before the MySQL insert
    'scrapy.pipelines.images.ImagesPipeline': 1,
    'blspider.pipelines.BlspiderPipeline': 2,
}

# item field that supplies the image URLs to ImagesPipeline
IMAGES_URLS_FIELD = "image_url"
# item field that receives the download results
IMAGES_RESULT_FIELD = "image_path"
import os
root = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(root, "images")

MYSQL_HOST = "localhost"
MYSQL_DB = "..."
MYSQL_USER = "..."
MYSQL_PW = "..."
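Note that ImagesPipeline depends on Pillow, so run pip install pillow if it is not already installed. With the table created and the settings filled in, start the crawl from the project root:

scrapy crawl jobbole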