我们在 items 文件中编写如下内容:
导入这个文件
from scrapy.loader import ItemLoader
自定义的Loader
class ArticleItemLoader(ItemLoader):
    """Project-specific item loader.

    Replaces the loader-wide default output processor with ``TakeFirst()``;
    fields that declare their own ``output_processor`` are not affected by
    this default.
    """

    default_output_processor = TakeFirst()
自定义item类
class JobBoleArticleItem(scrapy.Item):
    """Scraped article item together with the SQL needed to persist it."""

    title = scrapy.Field()
    create_date = scrapy.Field(
        # Custom date conversion applied to every extracted value.
        input_processor=MapCompose(date_convert),
    )
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    front_image_url = scrapy.Field(
        # Pass-through output processor: the value stays a list, which is
        # what get_insert_sql() expects when it reads element 0.
        output_processor=MapCompose(return_value)
    )
    front_image_path = scrapy.Field()
    praise_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    comment_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    fav_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    tags = scrapy.Field(
        # Blank out the comment-count entry, then emit one comma-separated
        # string.
        input_processor=MapCompose(remove_comment_tags),
        output_processor=Join(",")
    )
    content = scrapy.Field()

    def get_insert_sql(self):
        """Build the parameterized upsert statement for this article.

        Returns:
            A ``(insert_sql, params)`` tuple: the SQL text with ten ``%s``
            placeholders and the matching tuple of ten values.

        Raises:
            KeyError: if title, url, create_date, fav_nums, praise_nums,
                comment_nums, tags or content has not been populated.
        """
        # On a duplicate key each refreshed column takes its OWN new value.
        # The previous text assigned VALUES(fav_nums) to ``content``, which
        # replaced the article body with the favourite count.
        # NOTE(review): url_object_id is collected by the item but is not
        # part of this insert -- confirm against the table schema.
        insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums, front_image_url, front_image_path,
            praise_nums, comment_nums, tags, content)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE content=VALUES(content), fav_nums=VALUES(fav_nums)
        """

        # The cover image is optional: fall back to empty strings instead of
        # raising KeyError when the URL list or the stored path is missing.
        image_urls = self.get("front_image_url")
        front_image_url = image_urls[0] if image_urls else ""
        front_image_path = self.get("front_image_path", "")

        params = (
            self["title"],
            self["url"],
            self["create_date"],
            self["fav_nums"],
            front_image_url,
            front_image_path,
            self["praise_nums"],
            self["comment_nums"],
            self["tags"],
            self["content"],
        )
        return insert_sql, params
input_processor 和 output_processor 接收的必须是处理函数(可调用对象),下面是这些处理函数的定义:
def date_convert(value):
    """Convert a ``YYYY/MM/DD`` string into a ``datetime.date``.

    Args:
        value: Text extracted from the page. Surrounding whitespace and a
            "·" separator are removed before parsing.

    Returns:
        The parsed date, or today's date when ``value`` cannot be parsed
        (wrong format, or not a string at all).
    """
    try:
        # Raw text nodes usually carry line breaks / padding, and presumably
        # a trailing "·" separator on this site (TODO confirm); strptime
        # rejects any such extra characters, so clean the value first.
        cleaned = value.strip().replace("·", "").strip()
        return datetime.datetime.strptime(cleaned, "%Y/%m/%d").date()
    except (AttributeError, TypeError, ValueError):
        # Best-effort fallback kept from the original behaviour.
        return datetime.datetime.now().date()
def get_nums(value):
    """Return the first run of digits found in ``value`` as an ``int``.

    Args:
        value: Text such as " 2 收藏"; it may contain line breaks.

    Returns:
        The first integer appearing in ``value``, or 0 when it contains no
        digits.
    """
    # A raw-string pattern avoids the invalid "\d" escape of a plain string
    # literal, and search() (unlike match() with ".") is not stopped by a
    # newline in front of the number.
    found = re.search(r"\d+", value)
    return int(found.group()) if found else 0
def return_value(value):
    """Identity processor: hand ``value`` back unchanged."""
    return value
def remove_comment_tags(value):
    """Blank out the comment-count entry picked up among the tags.

    Returns an empty string when ``value`` contains "评论"; otherwise returns
    ``value`` unchanged.
    """
    is_comment_entry = "评论" in value
    if not is_comment_entry:
        return value
    return ""
我们在解析文件(spider)中使用自定义的 ArticleItemLoader:
def parse_detail(self, response):
    """Populate a ``JobBoleArticleItem`` from an article page and yield it.

    Args:
        response: Response for the article page. ``response.meta`` may carry
            a ``front_image_url`` entry (the article's cover image); an empty
            string is used when it is absent.

    Yields:
        The item produced by the loader.
    """
    # Cover image of the article, read from the request meta.
    front_image_url = response.meta.get("front_image_url", "")

    # Load the item through the item loader. (The original also created a
    # throwaway JobBoleArticleItem that was overwritten before use; that
    # dead assignment is removed.)
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    # Wrapped in a list: the item reads element 0 of this field later.
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")

    yield item_loader.load_item()
网友评论