【Audience】
Suitable for web-scraping beginners
【Prerequisites】
1. Python and the crawling environment (Scrapy) installed via Anaconda
2. Basic familiarity with Python
【Goal】
Scrape books from a free novel site, for example: https://www.kanshushenzhan.com/13238/
【Crawling approach】
GET the page --> parse the data --> store it
This flow covers the common case; some sites only expose the complete data through their backend APIs, so you have to dig out those endpoints instead.
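As a minimal illustration of that three-step flow outside Scrapy, here is a sketch using the requests library together with Scrapy's standalone Selector. The CSS selector is the one used later in this post; requests and the output file name are only illustrative assumptions:

# flow_sketch.py -- minimal "get -> parse -> store" sketch (illustrative only)
import requests
from scrapy.selector import Selector

html = requests.get('https://www.kanshushenzhan.com/13238/').text        # 1. get the page
title = Selector(text=html).css('.bookPhr > h2::text').extract_first()   # 2. parse the data
with open('book_title.txt', 'w', encoding='utf-8') as f:                 # 3. store it
    f.write(title or '')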
【Steps】
1. Scrape the book detail page
Project structure:
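For reference, a Scrapy project generated with `scrapy startproject kanshu` typically looks like this (the spider file is added by hand under spiders/):

kanshu/
    scrapy.cfg
    kanshu/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            kanshu_spider.py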

The code is as follows:
# kanshu_spider.py
import scrapy
from scrapy.selector import Selector

# import the item definition
from kanshu.items import KanshuItem


class KanshuSpider(scrapy.Spider):
    name = "kanshu"

    def start_requests(self):
        urls = [
            'https://www.kanshushenzhan.com/13238/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        print("entered parse")
        # data container
        item = KanshuItem()
        item['section'] = []
        try:
            # extract the book fields
            item['book_name'] = response.css('.bookPhr > h2::text').extract_first()
            item['o_name'] = item['book_name']
            item['zuozhe'] = response.css('.bookPhr > dl > dd::text').extract_first()
            item['fenmian'] = 'https://www.kanshushenzhan.com' + response.css('.bookImg > img::attr(src)').extract_first()
            item['jieshao'] = response.css('.introCon > p::text').extract_first()
            item['source'] = 'kanshukanshu'
            # the chapter list lives in a <ul>; collect (chapter number, url, title) tuples
            body = response.xpath('//*[@id="yuedu"]/div[2]/ul').extract_first()
            selectList = Selector(text=body).css('ul > li > a')
            for index, section in enumerate(selectList):
                opt = (index + 1,
                       'https://www.kanshushenzhan.com' + section.css('a::attr(href)').extract_first(),
                       section.css('a::text').extract_first())
                item['section'].append(opt)
        except Exception as e:
            print("exception while parsing:", e)
        return item
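With the spider in place, it can be run from the project root with `scrapy crawl kanshu`. The equivalent from a plain Python script looks roughly like this; the import path assumes the file location shown in the project structure above:

# run_spider.py -- run the spider programmatically (equivalent to `scrapy crawl kanshu`)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from kanshu.spiders.kanshu_spider import KanshuSpider

process = CrawlerProcess(get_project_settings())  # load the project's settings.py
process.crawl(KanshuSpider)
process.start()  # blocks until the crawl finishes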
items.py:
# items.py
import scrapy


class KanshuItem(scrapy.Item):
    # book name
    book_name = scrapy.Field()
    # original book name
    o_name = scrapy.Field()
    # synopsis
    jieshao = scrapy.Field()
    # author
    zuozhe = scrapy.Field()
    # source site
    source = scrapy.Field()
    # cover image
    fenmian = scrapy.Field()
    # chapter list; each element is a tuple of (chapter number, url, title)
    section = scrapy.Field()
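Scrapy Items expose a dict-like interface, which is why the spider above can assign fields with item['book_name'] = ... and append to item['section']. A quick illustration (values are placeholders):

item = KanshuItem()
item['book_name'] = 'demo book'                                    # dict-style assignment
item['section'] = [(1, 'https://example.com/1.html', 'Chapter 1')]  # only declared fields are allowed
print(dict(item))                                                   # convert to a plain dict for inspection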
pipelines.py:
# pipelines.py
import pymysql


class KanshuPipeline(object):
    def __init__(self):
        super().__init__()
        # connect to the database
        self.connect = pymysql.connect(
            host='127.0.0.1',   # database host
            port=3306,          # database port
            db='testtest',      # database name
            user='root',        # database user
            passwd='fd',        # database password
            charset='utf8',     # character encoding
            use_unicode=True)
        # all inserts/updates/queries go through this cursor
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        try:
            arr = (item['book_name'], item['o_name'], item['jieshao'], item['zuozhe'], item['source'], item['fenmian'])
            self.cursor.execute(
                """
                INSERT INTO book(`book_name`, o_name, jieshao, zuozhe, source, fenmian) VALUES (%s, %s, %s, %s, %s, %s)
                """, arr)
            # id of the book row that was just inserted
            book_id = self.connect.insert_id()
            for opt in item['section']:
                arr = (book_id, opt[2], opt[1], opt[0])
                self.cursor.execute("""
                    INSERT INTO book_section(book_id, `s_name`, section_url, chapter) VALUES (%s, %s, %s, %s)
                    """, arr)
            # commit the book and all its chapters in one transaction
            self.connect.commit()
        except Exception as e:
            self.connect.rollback()
            print("exception while saving:", e)
        return item
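The pipeline assumes the book and book_section tables already exist; the post does not show their schema. A minimal sketch that matches the INSERT statements above (the column names come from the code, the column types are assumptions) could be created like this:

# create_tables.py -- assumed schema matching the INSERTs above (column types are guesses)
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, db='testtest',
                       user='root', passwd='fd', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS book (
            id INT PRIMARY KEY AUTO_INCREMENT,
            book_name VARCHAR(255), o_name VARCHAR(255), jieshao TEXT,
            zuozhe VARCHAR(255), source VARCHAR(64), fenmian VARCHAR(512)
        )""")
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS book_section (
            id INT PRIMARY KEY AUTO_INCREMENT,
            book_id INT, s_name VARCHAR(255), section_url VARCHAR(512), chapter INT
        )""")
conn.commit()
conn.close()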
Enable the pipeline in settings.py:
ITEM_PIPELINES = {
    'kanshu.pipelines.KanshuPipeline': 300,
}
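Depending on the target site, two other standard Scrapy settings often come into play; whether they are needed for this particular site is something to verify yourself:

# also in settings.py (optional; verify against the target site)
ROBOTSTXT_OBEY = False    # Scrapy obeys robots.txt by default, which can block the crawl
DOWNLOAD_DELAY = 1        # wait 1 second between requests to avoid hammering the server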
This code is for learning purposes only.