这是全部的调试过程,我已经整理成为笔记,这里分享给大家:
python爬取豆瓣两千万图书简介信息:(一)目标API分析
python爬取豆瓣两千万图书简介信息:(二)简单python请求urllib2
python爬取豆瓣两千万图书简介信息:(三)异常处理
python爬取豆瓣两千万图书简介信息:(四)多进程并发
python爬取豆瓣两千万图书简介信息:(五)数据库设计
python爬取豆瓣两千万图书简介信息:(六)数据库操作类
python爬取豆瓣两千万图书简介信息:(七)代理IP
python爬取豆瓣两千万图书简介信息:(八)总结
数据库操作类
数据库建好了,因为业务的单一性和频发性,单独做一个数据库操作类在代码管理上为好。
代码如下:
# -*- coding:utf-8 -*-
import mysql.connector
import gc
import sys
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
reload(sys)
sys.setdefaultencoding(default_encoding)
def cb_conncet_sql(sql_query):
# 创建数据库连接方法
config = {'host': 'localhost',
'user': 'root',
'password': 'root',
'port': 3306,
'database': 'doubanbook',
'charset': 'utf8',
'buffered': True
}
cnn = mysql.connector.connect(**config)
try:
cursor = cnn.cursor()
cursor.execute(sql_query)
cnn.commit()
cds = cursor.fetchall()
cb_print('查询成功:'+ str(cds))
return cds
except mysql.connector.Error as e:
if format(e) != 'No result set to fetch from.':
cb_print('connect fails!{}'.format(e))
finally:
cursor.close()
cnn.close()
gc.collect()
def insert_book_info(par):
# 解析bookinfo 并拼接SQL,插入数据库
if not par:
cb_print('par 异常')
return
try:
images = par['images']
large_image = images['large']
tags_dict = par['tags']
tags = ''
if tags_dict:
for tag_item in tags_dict:
tags = tags + tag_item['title'] + '&'
tags = tags[:-1]
tags = tags.replace('"', '')
author_arr = par['author']
author = ''
if author_arr:
for author_item in author_arr:
author = author + author_item + '&'
author = author[:-1]
author = author.replace('"', '')
rating = par['rating']
average = rating['average']
numraters = rating['numRaters']
subtitle = par['subtitle']
subtitle = subtitle.replace('"', '\'\'')
if len(subtitle) > 250:
subtitle = subtitle[0:250]
title = par['title']
title = title.replace('"', '\'\'')
publisher = par['publisher']
publisher = publisher.replace('"', '\'\'')
binding = par['binding']
binding = binding.replace('"', '\'\'')
sql_query = ("INSERT INTO books (id, isbn13, publisher, pages,"
" title, image, alt, isbn10, "
"subtitle, "
"price, binding,"
"pubdate, large_image, "
"rating, numRaters,"
"tags, author) VALUES ("
+ par['id'] + ','
+ '"' + par['isbn13'] + '"' + ','
+ '"' + publisher + '"' + ','
+ '"' + par['pages'] + '"' + ','
+ '"' + title + '"' + ','
+ '"' + par['image'] + '"' + ','
+ '"' + par['alt'] + '"' + ','
+ '"' + par['isbn10'] + '"' + ','
+ '"' + subtitle + '"' + ','
+ '"' + par['price'] + '"' + ','
+ '"' + binding + '"' + ','
+ '"' + par['pubdate'] + '"' + ','
+ '"' + large_image + '"' + ','
+ '"' + str(average) + '"' + ','
+ '"' + str(numraters) + '"' + ','
+ '"' + tags + '"' + ','
+ '"' + author + '"' + ');')
cb_conncet_sql(sql_query)
except Exception as e:
if format(e) != 'No result set to fetch from.':
cb_print('connect fails!{}'.format(e))
finally:
gc.collect()
def get_thread_index_id(index):
# 获取第index个进程的最大bookid
try:
sql = ('select max(id) from books WHERE id < '
+ str(1000000 + (index + 1) * 50000) + ';')
cb_print(sql)
result = cb_conncet_sql(sql)
if not result:
return 0
else:
re = result[0]
if not re:
return 0
else:
return re[0]
except Exception as e:
if format(e) != 'No result set to fetch from.':
cb_print('connect fails!{}'.format(e))
finally:
gc.collect()
def insert_error_book_id(index):
# 将异常bookid 插入 error_books表中
try:
sql = 'INSERT INTO error_books (bookid) VALUES (' + str(index) + ');'
cb_conncet_sql(sql)
except Exception as e:
if format(e) != 'No result set to fetch from.':
cb_print('connect fails!{}'.format(e))
finally:
gc.collect()
def insert_none_book_id(index):
# 将不存在书目的bookid 插入 none_books表中
try:
sql = 'INSERT INTO none_books (bookid) VALUES (' + str(index) + ');'
cb_conncet_sql(sql)
except Exception as e:
if format(e) != 'No result set to fetch from.':
cb_print('connect fails!{}'.format(e))
finally:
gc.collect()
网友评论