The code is still messy; I'm saving it here for now, and will add proxy support and multithreading later (a rough sketch of that is included right after the script below).
First, extract the link to each artwork from the listing pages in the navigation bar.
#Python2.7
# _*_ coding:utf-8 _*_
import requests
from bs4 import BeautifulSoup
from lxml import etree
import MySQLdb

# Store one record in the database
# Argument: a dictionary of field values
def DatabaseInfo(data_dictionary):
    try:
        conn = MySQLdb.connect(host='127.0.0.1', user='root',
                               passwd='123456', port=3306, db='gallery')
        cur = conn.cursor()  # database cursor
        # Force utf8 on the connection; without this, inserts fail with:
        # UnicodeEncodeError: 'latin-1' codec can't encode character
        conn.set_character_set('utf8')
        cur.execute('SET NAMES utf8;')
        cur.execute('SET CHARACTER SET utf8;')
        cur.execute('SET character_set_connection=utf8;')
        # Parameterized INSERT: 14 columns, 14 placeholders
        sql = 'insert into imginfor' \
              '(title_text, author, price, size, creation_time, classification, ' \
              'material, theme, works_label, applicable_space, personal_profile, ' \
              'birthday, native_place, works_show) ' \
              'values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
        cur.execute(sql, (data_dictionary['title_text'], data_dictionary['author'],
                          data_dictionary['price'], data_dictionary['size'],
                          data_dictionary['creation_time'], data_dictionary['classification'],
                          data_dictionary['material'], data_dictionary['theme'],
                          data_dictionary['works_label'], data_dictionary['applicable_space'],
                          data_dictionary['personal_profile'], data_dictionary['birthday'],
                          data_dictionary['native_place'], data_dictionary['works_show']))
        conn.commit()  # commit only after a successful execute
        print 'Inserted into the database'
    except MySQLdb.Error, e:
        print "Mysql Error %d: %s" % (e.args[0], e.args[1])
    finally:
        cur.close()
        conn.close()
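# A minimal usage sketch for DatabaseInfo with made-up values, kept
# commented out so it never runs during the crawl (the real call is
# made from spider_data below):
# DatabaseInfo({'title_text': u'Sample Work', 'author': u'Artist Name',
#               'price': u'1000', 'size': u'60cm x 80cm',
#               'creation_time': u'2015', 'classification': u'Photography',
#               'material': u'Photo paper', 'theme': u'Landscape',
#               'works_label': u'nature', 'applicable_space': u'Living room',
#               'personal_profile': u'Short biography', 'birthday': u'1970',
#               'native_place': u'Beijing', 'works_show': u'http://example.com/a.jpg'})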
# Extract the per-artwork sub-links from the listing pages
def download_link():
    i = 1
    link_list = []
    while i < 10:
        original_url = 'http://gallery.artron.net/works/sheying-all-0-' + str(i) + '.html'
        # Print the listing page currently being fetched
        print original_url
        my_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
        }
        response = requests.get(url=original_url, headers=my_headers)
        contents = response.text
        xml = etree.HTML(contents)
        # Extract the sub-links
        datas = xml.xpath('//div[@class="pic"]/a/@href')
        # Collect the sub-links into a list
        for data in datas:
            link_list.append(data)
        # print link_list
        # An alternative with BeautifulSoup:
        # soup = BeautifulSoup(contents, "lxml")
        # items = soup.find_all('a', target='_blank')
        # for div in items:
        #     print div.get('href')
        i += 1
    # Start crawling the collected links
    spider_data(link_list)
# Crawl the required data from each sub-link
def spider_data(link_list):
    my_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
    }
    data_dictionary = {'title_text': '', 'author': '', 'price': '', 'size': '',
                       'creation_time': '', 'classification': '', 'material': '', 'theme': '',
                       'works_label': '', 'applicable_space': '', 'personal_profile': '',
                       'birthday': '', 'native_place': '', 'works_show': ''
                       }
    len_list = len(link_list)
    i = 0  # count of works crawled (was reset inside the loop before, so the total was wrong)
    for num in range(len_list):
        print "Crawling link %s: %s" % (num + 1, link_list[num])
        response = requests.get(url=link_list[num], headers=my_headers)
        contents = response.text
        soup = BeautifulSoup(contents, "lxml")
        # Artwork information
        for tag in soup.find_all(attrs={"class": "workIntro"}):
            i = i + 1
            # Title
            title_text = tag.find("h1").get_text()
            title_text = title_text.replace('\n', '')
            data_dictionary['title_text'] = title_text
            print title_text
            # Author name
            author = tag.select('li > a')[0].get('title')
            data_dictionary['author'] = author
            print author
            # Price
            price = tag.find('em').get_text()
            data_dictionary['price'] = price
            print price
        # Artwork attribute table
        for tag in soup.find_all(attrs={"class": "table"}):
            # Size; strip() removes the surrounding whitespace
            size = tag.select('td')[0].get_text().strip()
            data_dictionary['size'] = size
            print size
            # Year of creation
            creation_time = tag.select('td')[1].get_text().strip()
            data_dictionary['creation_time'] = creation_time
            print creation_time
            # Category
            classification = tag.select('td')[2].get_text().strip()
            data_dictionary['classification'] = classification
            print classification
            # Material
            material = tag.select('td')[3].get_text().strip()
            data_dictionary['material'] = material
            print material
            # Theme
            theme = tag.select('td')[4].get_text().strip()
            data_dictionary['theme'] = theme
            print theme
            # Artwork tags
            works_label = tag.select('td')[5].get_text().strip()
            data_dictionary['works_label'] = works_label
            print works_label
            # Applicable space (the original read td[5] again, which just
            # duplicated works_label; td[6] should be the next cell)
            applicable_space = tag.select('td')[6].get_text().strip()
            data_dictionary['applicable_space'] = applicable_space
            print applicable_space
        # Author introduction
        for tag in soup.find_all(attrs={"class": "introWrap htmlEdit"}):
            # Personal profile: join every <p> instead of keeping only the last one
            paragraphs = [pnum.get_text().strip() for pnum in tag.select('p')]
            data_dictionary['personal_profile'] = '\n'.join(paragraphs)
            print data_dictionary['personal_profile']
        # Personal details
        for tag in soup.find_all(attrs={"class": "authDetail mt20"}):
            # Date of birth
            birthday = tag.select('td')[0].get_text()
            data_dictionary['birthday'] = birthday
            print birthday
            # Native place
            native_place = tag.select('td')[1].get_text()
            data_dictionary['native_place'] = native_place
            print native_place
        # Artwork images: join every data-img URL instead of keeping only the last one
        for tag in soup.find_all(attrs={"class": "workShowLit"}):
            imgs = [imgnum.get('data-img') for imgnum in tag.select('li') if imgnum.get('data-img')]
            data_dictionary['works_show'] = '\n'.join(imgs)
            print data_dictionary['works_show']
        # Key step: write the record into MySQL
        print u'Saving to database'
        DatabaseInfo(data_dictionary)
        print '\n'
    print u'Total works crawled:', i
if __name__ == '__main__':
    download_link()
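As mentioned at the top, proxies and multithreading are still to be done. Below is a minimal, untested sketch of how they could be wired in with requests and a thread pool. The proxy address, pool size, and the fetch_page helper are all placeholders of mine, not part of the original script; note also that each worker thread should open its own MySQLdb connection rather than sharing one.
# _*_ coding:utf-8 _*_
# Sketch only: placeholder proxy and pool size; fetch_page is a hypothetical helper
import requests
from multiprocessing.dummy import Pool  # thread-based pool, fine for I/O-bound crawling

MY_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
}
PROXIES = {
    'http': 'http://127.0.0.1:8080',  # placeholder; swap in a working proxy
}

def fetch_page(url):
    # One download job; the parsing from spider_data would move in here
    response = requests.get(url, headers=MY_HEADERS, proxies=PROXIES, timeout=10)
    return url, response.status_code

if __name__ == '__main__':
    urls = ['http://gallery.artron.net/works/sheying-all-0-%d.html' % i
            for i in range(1, 10)]
    pool = Pool(4)  # 4 worker threads
    for url, status in pool.map(fetch_page, urls):
        print url, status
    pool.close()
    pool.join()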
Creating the table in the database:
CREATE TABLE `imginfor` (
  `ID` int(11) NOT NULL AUTO_INCREMENT,
  `title_text` varchar(200) COLLATE utf8_bin DEFAULT NULL COMMENT 'artwork title',
  `author` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT 'author name',
  `price` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT 'price',
  `size` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT 'size',
  `creation_time` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT 'year of creation',
  `classification` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT 'category',
  `material` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT 'material',
  `theme` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT 'theme',
  `works_label` varchar(200) COLLATE utf8_bin DEFAULT NULL COMMENT 'artwork tags',
  `applicable_space` varchar(200) COLLATE utf8_bin DEFAULT NULL COMMENT 'applicable space',
  `personal_profile` varchar(500) COLLATE utf8_bin DEFAULT NULL COMMENT 'personal profile',
  `birthday` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT 'date of birth',
  `native_place` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT 'native place',
  `works_show` varchar(500) COLLATE utf8_bin DEFAULT NULL COMMENT 'artwork image URLs',
  PRIMARY KEY (`ID`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
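To spot-check what the crawler wrote, a quick query (my addition, not from the original post) pulls the most recent rows:
SELECT ID, title_text, author, price FROM imginfor ORDER BY ID DESC LIMIT 5;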
The results are as follows:
(screenshot: console output of the crawler)
The database:
(screenshot: the imginfor table with the crawled rows)