
Scraping Yachang (Artron) gallery data with Python and storing it in MySQL

Author: 十八度的帝都 | Published 2017-10-24 18:59

The code is still a bit messy, so I'm just saving it here for now; proxy support and multithreading will be added later (a rough sketch is included after the script).
The first step is to extract the link to each work from the listing pages.

# Python 2.7
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import re
from lxml import etree
import MySQLdb

# Store one record in the database
# Parameter: a dictionary of the scraped fields
def DatabaseInfo(data_dictionary):
    conn = None
    cur = None
    try:
        conn = MySQLdb.connect(host='127.0.0.1', user='root',
                               passwd='123456', port=3306, db='gallery')
        cur = conn.cursor()  # database cursor

        # Avoids: UnicodeEncodeError: 'latin-1' codec can't encode character
        conn.set_character_set('utf8')
        cur.execute('SET NAMES utf8;')
        cur.execute('SET CHARACTER SET utf8;')
        cur.execute('SET character_set_connection=utf8;')

        # Parameterized INSERT into the imginfor table
        sql = 'insert into imginfor' \
              '(title_text, author, price, size, creation_time, classification' \
              ', material, theme, works_label, applicable_space, personal_profile,' \
              ' birthday, native_place, works_show) ' \
              'values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'

        cur.execute(sql, (data_dictionary['title_text'], data_dictionary['author'], data_dictionary['price'],
                          data_dictionary['size'], data_dictionary['creation_time'], data_dictionary['classification'],
                          data_dictionary['material'], data_dictionary['theme'], data_dictionary['works_label'],
                          data_dictionary['applicable_space'], data_dictionary['personal_profile'], data_dictionary['birthday'],
                          data_dictionary['native_place'], data_dictionary['works_show']))
        conn.commit()
        print 'Inserted into database'
    except MySQLdb.Error, e:
        print "Mysql Error %d: %s" % (e.args[0], e.args[1])
    finally:
        # cur/conn may not exist if the connection itself failed
        if cur:
            cur.close()
        if conn:
            conn.close()

# Extract the link to each work from the listing pages
def download_link():
    i = 1
    link_list = []
    while i < 10:
        original_url = 'http://gallery.artron.net/works/sheying-all-0-' + str(i) + '.html'
        # print the listing page being fetched
        print original_url
        my_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
        }
        response = requests.get(url=original_url, headers=my_headers)
        contents = response.text
        xml = etree.HTML(contents)
        # extract the links to the individual works
        datas = xml.xpath('//div[@class="pic"]/a/@href')
        # collect the links
        for data in datas:
            link_list.append(data)
            # print link_list
        # Alternative approach with BeautifulSoup:
        # soup = BeautifulSoup(contents, "lxml")
        # items = soup.find_all('a', target='_blank')
        # for div in items:
        #     print div.get('href')
        i += 1
    # start scraping only after all listing pages have been collected
    # (calling spider_data inside the loop would re-scrape the same links on every pass)
    spider_data(link_list)
    return link_list

# Scrape the required fields from each work's page
def spider_data(link_list):
    my_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
    }
    data_dictionary = {'title_text': '', 'author': '', 'price': '', 'size': '',
                       'creation_time': '', 'classification': '', 'material': '', 'theme': '',
                       'works_label': '', 'applicable_space': '', 'personal_profile': '',
                       'birthday': '', 'native_place': '', 'works_show': ''
                       }
    len_list = len(link_list)
    for num in range(len_list):
        print "Scraping link %s: %s" % (num + 1, link_list[num])
        response = requests.get(url=link_list[num], headers=my_headers)
        contents = response.text
        soup = BeautifulSoup(contents, "lxml")
        i = 0
        # work information
        for tag in soup.find_all(attrs={"class": "workIntro"}):
            # print tag.get_text()
            # title
            i = i + 1
            title_text = tag.find("h1").get_text()
            title_text = title_text.replace('\n', '')
            data_dictionary['title_text'] = title_text
            print title_text

            # author name (from the title attribute of the first <a> under the <li>)
            author = tag.select('li > a')[0].get('title')
            data_dictionary['author'] = author
            print author

            # price
            price = tag.find('em').get_text()
            data_dictionary['price'] = price
            print price

        # work details (the spec table; strip() removes the surrounding whitespace)
        for tag in soup.find_all(attrs={"class": "table"}):
            # size
            size = tag.select('td')[0].get_text().strip()
            data_dictionary['size'] = size
            print size
            # year of creation
            creation_time = tag.select('td')[1].get_text().strip()
            data_dictionary['creation_time'] = creation_time
            print creation_time
            # category
            classification = tag.select('td')[2].get_text().strip()
            data_dictionary['classification'] = classification
            print classification
            # material
            material = tag.select('td')[3].get_text().strip()
            data_dictionary['material'] = material
            print material
            # theme
            theme = tag.select('td')[4].get_text().strip()
            data_dictionary['theme'] = theme
            print theme
            # work tags
            works_label = tag.select('td')[5].get_text().strip()
            data_dictionary['works_label'] = works_label
            print works_label
            # applicable space (presumably the next <td> after the tags, i.e. index 6, not 5 again)
            applicable_space = tag.select('td')[6].get_text().strip()
            data_dictionary['applicable_space'] = applicable_space
            print applicable_space
            # print (u'applicable space: ' + data_dictionary['applicable_space'])
        # about the artist
        for tag in soup.find_all(attrs={"class": "introWrap htmlEdit"}):
            # biography: join all <p> paragraphs instead of keeping only the last one
            paragraphs = [pnum.get_text().strip() for pnum in tag.select('p')]
            personal_profile = '\n'.join(paragraphs)
            data_dictionary['personal_profile'] = personal_profile
            print personal_profile
        # artist details
        for tag in soup.find_all(attrs={"class": "authDetail mt20"}):
            # date of birth
            birthday = tag.select('td')[0].get_text()
            data_dictionary['birthday'] = birthday
            print birthday
            # birthplace
            native_place = tag.select('td')[1].get_text()
            data_dictionary['native_place'] = native_place
            print native_place

        # work images: collect every data-img URL instead of keeping only the last one
        for tag in soup.find_all(attrs={"class": "workShowLit"}):
            images = [imgnum.get('data-img') for imgnum in tag.select('li')]
            works_show = ','.join(img for img in images if img)
            data_dictionary['works_show'] = works_show
            print works_show

        # the main point: write the record to MySQL
        print u'Saving to database'
        DatabaseInfo(data_dictionary)
        print '\n'
    print u'Total works scraped:', len_list

if __name__ == '__main__':
    download_link()
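The proxies and multithreading mentioned at the top are not in yet. Below is a minimal sketch of how they could be bolted onto the requests calls above, using the proxies= argument and a multiprocessing.dummy thread pool; the proxy address, pool size and function names (fetch, crawl_concurrently) are placeholders, not part of the original script.

# Rough sketch of proxy + multithreading support (Python 2); placeholder values only
from multiprocessing.dummy import Pool as ThreadPool
import requests

PROXIES = {'http': 'http://127.0.0.1:8080'}  # placeholder proxy address
MY_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
}

def fetch(url):
    # every requests call accepts a proxies= argument
    response = requests.get(url, headers=MY_HEADERS, proxies=PROXIES, timeout=10)
    return response.text

def crawl_concurrently(link_list, workers=4):
    # fetch the work pages on a small thread pool instead of one by one
    pool = ThreadPool(workers)
    pages = pool.map(fetch, link_list)
    pool.close()
    pool.join()
    return pages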

Creating the table in the database:

CREATE TABLE `imginfor` (
  `ID` int(11) NOT NULL AUTO_INCREMENT,
  `title_text` varchar(200) COLLATE utf8_bin DEFAULT NULL COMMENT 'work title',
  `author` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT 'author name',
  `price` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT 'price',
  `size` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT 'size',
  `creation_time` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT 'year of creation',
  `classification` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT 'category',
  `material` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT 'material',
  `theme` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT 'theme',
  `works_label` varchar(200) COLLATE utf8_bin DEFAULT NULL COMMENT 'work tags',
  `applicable_space` varchar(200) COLLATE utf8_bin DEFAULT NULL COMMENT 'applicable space',
  `personal_profile` varchar(500) COLLATE utf8_bin DEFAULT NULL COMMENT 'artist biography',
  `birthday` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT 'date of birth',
  `native_place` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT 'birthplace',
  `works_show` text COLLATE utf8_bin COMMENT 'work image URLs',
  PRIMARY KEY (`ID`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
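To confirm that rows are actually being written, a quick check with the same MySQLdb connection settings used above is enough; this is just an illustrative snippet, not part of the crawler itself.

# Print the most recent rows from the imginfor table
import MySQLdb
conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='123456', port=3306, db='gallery')
cur = conn.cursor()
cur.execute('SELECT ID, title_text, author, price FROM imginfor ORDER BY ID DESC LIMIT 5')
for row in cur.fetchall():
    print row
cur.close()
conn.close()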

The results are as follows:

(screenshot of the crawler output omitted)

The database:

(screenshot of the imginfor table omitted)
