一. Project Structure
![](https://img.haomeiwen.com/i3888998/06a52202eecf041c.png)
- main.py is the script that launches the spider; tangshispider.py under the spiders folder is the main crawler.
- items.py defines the storage fields, matching the table-creation statements from the previous article (a hedged sketch of those tables follows below).
- pipelines.py is the pipeline that receives scraped data; it routes each item to the right table based on the item classes defined in items.py.
- settings.py is the configuration script; its main job here is enabling the pipeline.
项目git地址:https://github.com/chengcxy/scrapy_spiders/tree/master/tangshi
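The table-creation statements themselves live in the previous article. For reference, here is a minimal sketch of what the two tables could look like, inferred from the fields in items.py and the table names (poemers, poem_zuopin) used in pipelines.py; the column types are my assumptions, not the original DDL.

```python
# Hypothetical table setup -- table/column names match the pipeline code,
# but the column types are assumptions inferred from items.py.
import pymysql

DDL_POEMERS = """
CREATE TABLE IF NOT EXISTS poemers (
    id INT AUTO_INCREMENT PRIMARY KEY,
    chaodai VARCHAR(20),        -- dynasty
    poemer VARCHAR(100),        -- poet name
    zuopins_total INT,          -- total number of works
    poemer_url VARCHAR(255)     -- poet detail page URL
)"""

DDL_POEM_ZUOPIN = """
CREATE TABLE IF NOT EXISTS poem_zuopin (
    id INT AUTO_INCREMENT PRIMARY KEY,
    poemer VARCHAR(100),
    poemer_url VARCHAR(255),
    zuopin_name VARCHAR(255),   -- work title
    name_words INT,             -- characters in the title
    zuopin_content TEXT,        -- work body
    zuopin_words INT,           -- characters in the body
    zuopin_url VARCHAR(255)
)"""

conn = pymysql.connect(host='localhost', port=3306, user='root',
                       password='your_password', db='local_db', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute(DDL_POEMERS)
    cursor.execute(DDL_POEM_ZUOPIN)
conn.commit()
conn.close()
```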
二. Code
2.1 items.py
```python
# -*- coding: utf-8 -*-
from scrapy import Field, Item

# Fields for the poet table
class TangshiItem(Item):
    chaodai = Field()          # dynasty
    poemer = Field()           # poet name
    zuopins_total = Field()    # total number of works
    poemer_url = Field()       # poet detail page URL

# Fields for the works table
class PoemZuopin(Item):
    poemer = Field()
    poemer_url = Field()
    zuopin_name = Field()      # work title
    name_words = Field()       # characters in the title
    zuopin_content = Field()   # work body
    zuopin_words = Field()     # characters in the body
    zuopin_url = Field()       # work detail page URL
```
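Scrapy's Item behaves like a dict but rejects any field that was not declared, which catches typos early. A quick illustration (this snippet is mine, not part of the original project):

```python
from tangshi.items import TangshiItem

item = TangshiItem()
item['poemer'] = '李白'
item['chaodai'] = '唐朝'
print(dict(item))          # {'poemer': '李白', 'chaodai': '唐朝'}

try:
    item['author'] = 'x'   # undeclared field
except KeyError as e:
    print(e)               # 'TangshiItem does not support field: author'
```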
2.2 Main spider: create tangshispider.py under the spiders folder
```python
# coding: utf8
from scrapy.http import Request
from scrapy.spiders import CrawlSpider

from tangshi.items import TangshiItem, PoemZuopin


class TangShiSpider(CrawlSpider):
    # No crawl rules are defined, so parse() is overridden directly and
    # CrawlSpider behaves like a plain Spider here.
    name = 'tangshispider'
    start_urls = ['http://www.shicimingju.com/category/tangdaishiren/page/1']
    base_url = 'http://www.shicimingju.com/category/tangdaishiren/page/{}'
    allowed_domains = ['www.shicimingju.com']

    def parse(self, response):
        # The pager text contains '共N页)'; strip the labels to get the total page count
        total_page = response.xpath('//div[@class="yema"]/text()').extract_first().split('/')[1].replace('共', '').replace('页)', '')
        print(total_page)
        for i in range(1, int(total_page) + 1):
            poem_page_url = self.base_url.format(str(i))
            yield Request(url=poem_page_url, callback=self.parse_poem_page)

    def parse_poem_page(self, response):
        print('parse_poem_page parsing ---> %s' % response.url)
        poems = response.xpath('//div[@class="shirenlist"]//a')
        for poem in poems:
            poemer_item = TangshiItem()
            poemer_url = 'http://www.shicimingju.com' + poem.xpath('@href').extract_first()
            poemer = poem.xpath('text()').extract_first()
            poemer_item['poemer_url'] = poemer_url
            poemer_item['poemer'] = poemer
            yield Request(url=poemer_url, callback=self.parse_poem, meta={'item': poemer_item})

    def parse_poem(self, response):
        poemer_item = response.meta['item']
        # Total number of works in the poet's collection, used to build pagination
        zuopins_total = response.xpath('//div[@class="num"]/b/text()').extract_first()
        poemer_item['chaodai'] = '唐朝'
        poemer_item['zuopins_total'] = zuopins_total
        yield poemer_item
        # Derive one request per page of the poet's collection (40 works per page)
        zuopin_page_base_url = poemer_item['poemer_url'].replace('.html', '') + '_{}.html'
        full_pages, remainder = divmod(int(zuopins_total), 40)
        pages = full_pages if remainder == 0 else full_pages + 1
        for page in range(1, pages + 1):
            zuopin_page_url = zuopin_page_base_url.format(page)
            yield Request(url=zuopin_page_url, callback=self.parse_page_zuopin)

    def parse_page_zuopin(self, response):
        zuopin_pages = response.xpath('//div[@class="shicilist"]/ul/li[1]/a')
        poemer_url = 'http://www.shicimingju.com' + response.xpath('//div[@class="shicilist"]/ul/li[2]/a[2]/@href').extract_first()
        poemer = response.xpath('//div[@class="shicilist"]/ul/li[2]/a[2]/em/text()').extract_first()
        for zuopin_page in zuopin_pages:
            item2 = {}
            zuopin_url = 'http://www.shicimingju.com' + zuopin_page.xpath('@href').extract_first()
            zuopin_name = zuopin_page.xpath('text()').extract_first()
            print('poet: %s, poet url: %s, work ==> %s, work url ==> %s' % (poemer, poemer_url, zuopin_name, zuopin_url))
            item2['poemer_url'] = poemer_url
            item2['poemer'] = poemer
            item2['zuopin_url'] = zuopin_url
            item2['zuopin_name'] = zuopin_name
            yield Request(url=zuopin_url, callback=self.parse_zuopin_detail, meta={'item2': item2})

    # Parse a work's detail page
    def parse_zuopin_detail(self, response):
        item = response.meta['item2']
        print('parse_zuopin_detail parsing ---> %s' % item['zuopin_url'])
        zuopin_item = PoemZuopin()
        zuopin_item['poemer'] = item['poemer']
        zuopin_item['poemer_url'] = item['poemer_url']
        zuopin_item['zuopin_name'] = item['zuopin_name']
        zuopin_item['name_words'] = len(item['zuopin_name'])
        zuopin_item['zuopin_url'] = item['zuopin_url']
        try:
            zuopin_content = response.xpath('//div[@class="shicineirong"]//text()').extract()
            zuopin_item['zuopin_content'] = ''.join([x.strip() for x in zuopin_content])
            # Character count excluding punctuation
            zuopin_item['zuopin_words'] = len(zuopin_item['zuopin_content'].replace(',', '').replace('。', ''))
        except Exception:
            zuopin_item['zuopin_content'] = '抓取失败无数据'  # "crawl failed, no data"
            zuopin_item['zuopin_words'] = 0
        print(zuopin_item)
        yield zuopin_item
```
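The pagination arithmetic in parse_poem deserves a quick sanity check: the site lists 40 works per page, divmod splits the total into full pages plus a remainder, and a nonzero remainder adds one more page. For example:

```python
# 40 works per listing page, as assumed by parse_poem above
for total in (80, 85, 39):
    full_pages, remainder = divmod(total, 40)
    pages = full_pages if remainder == 0 else full_pages + 1
    print('%d works -> %d page(s)' % (total, pages))
# 80 works -> 2 page(s)
# 85 works -> 3 page(s)
# 39 works -> 1 page(s)
```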
2.3 pipelines.py: the data pipeline that writes scraped items into the database
```python
# -*- coding: utf-8 -*-
import pymysql

from tangshi.items import TangshiItem, PoemZuopin


class TangshiPipeline(object):
    def __init__(self):
        self.MYSQL_CONFIG = {
            'host': 'localhost',
            'port': 3306,
            'user': 'root',
            'password': 'your_password',  # replace with your MySQL password
            'db': 'local_db',
            'charset': 'utf8'
        }
        self.conn = pymysql.connect(**self.MYSQL_CONFIG)

    def process_item(self, item, spider):
        # Route each item to its table based on the item class from items.py
        if isinstance(item, TangshiItem):
            poemers = ['chaodai', 'poemer', 'zuopins_total', 'poemer_url']
            poemers_sql = 'insert into poemers ({}) values(%s,%s,%s,%s)'.format(','.join(poemers))
            cursor = self.conn.cursor()
            cursor.execute(poemers_sql, (item['chaodai'], item['poemer'],
                                         item['zuopins_total'], item['poemer_url']))
            self.conn.commit()
        elif isinstance(item, PoemZuopin):
            zuopins = ['poemer', 'poemer_url', 'zuopin_name', 'name_words',
                       'zuopin_content', 'zuopin_words', 'zuopin_url']
            zuopin_sql = 'insert into poem_zuopin ({}) values(%s,%s,%s,%s,%s,%s,%s)'.format(','.join(zuopins))
            cursor = self.conn.cursor()
            cursor.execute(zuopin_sql, (item['poemer'], item['poemer_url'], item['zuopin_name'],
                                        item['name_words'], item['zuopin_content'],
                                        item['zuopin_words'], item['zuopin_url']))
            self.conn.commit()
        # Return the item so any later pipelines can see it
        return item

    def close_spider(self, spider):
        # Close the connection when the spider finishes
        self.conn.close()
```
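After a run, a quick count query confirms both tables received rows. This check is my addition, not part of the project (replace the credentials with your own):

```python
import pymysql

conn = pymysql.connect(host='localhost', port=3306, user='root',
                       password='your_password', db='local_db', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute('select count(*) from poemers')
    print('poets:', cursor.fetchone()[0])
    cursor.execute('select count(*) from poem_zuopin')
    print('works:', cursor.fetchone()[0])
conn.close()
```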
2.4 settings.py
```python
# -*- coding: utf-8 -*-
BOT_NAME = 'tangshi'

SPIDER_MODULES = ['tangshi.spiders']
NEWSPIDER_MODULE = 'tangshi.spiders'

ROBOTSTXT_OBEY = True

# Enable the database pipeline; the number (0-1000) sets its priority order
ITEM_PIPELINES = {
    'tangshi.pipelines.TangshiPipeline': 300,
}
```
2.5 main.py: the script that launches the spider
```python
from scrapy.cmdline import execute

# Equivalent to running `scrapy crawl tangshispider` from the project root
execute('scrapy crawl tangshispider'.split(' '))
```
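If you'd rather not go through scrapy.cmdline, an alternative launcher (my suggestion, not part of the original project) runs the spider in-process with CrawlerProcess:

```python
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from tangshi.spiders.tangshispider import TangShiSpider

# Load settings.py so the pipeline stays enabled, then run the spider
process = CrawlerProcess(get_project_settings())
process.crawl(TangShiSpider)
process.start()  # blocks until the crawl finishes
```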