
Scraping Yachang (Artron) gallery data with Python and storing it in MySQL

Author: 十八度的帝都 | Published 2017-10-24 18:59

The code is still a bit messy, so I'm just saving it here for now; proxy support and multithreading will be added later (a rough sketch is included after the script).
The first step is to extract the link to each work from the listing pages.

# Python 2.7
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import re
from lxml import etree
import MySQLdb

# Store one record in the database
# Parameter: a dictionary of the scraped fields
def DatabaseInfo(data_dictionary):
    conn = None
    cur = None
    try:
        conn = MySQLdb.connect(host='127.0.0.1', user='root',
                               passwd='123456', port=3306, db='gallery')
        cur = conn.cursor()  # database cursor

        # Avoids: UnicodeEncodeError: 'latin-1' codec can't encode character
        conn.set_character_set('utf8')
        cur.execute('SET NAMES utf8;')
        cur.execute('SET CHARACTER SET utf8;')
        cur.execute('SET character_set_connection=utf8;')

        # Parameterized INSERT into the imginfor table
        sql = 'insert into imginfor' \
              '(title_text, author, price, size, creation_time, classification' \
              ', material, theme, works_label, applicable_space, personal_profile,' \
              ' birthday, native_place, works_show) ' \
              'values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'

        cur.execute(sql, (data_dictionary['title_text'], data_dictionary['author'], data_dictionary['price'],
                          data_dictionary['size'], data_dictionary['creation_time'], data_dictionary['classification'],
                          data_dictionary['material'], data_dictionary['theme'], data_dictionary['works_label'],
                          data_dictionary['applicable_space'], data_dictionary['personal_profile'], data_dictionary['birthday'],
                          data_dictionary['native_place'], data_dictionary['works_show']))
        conn.commit()
        print 'Inserted into database'
    except MySQLdb.Error, e:
        print "Mysql Error %d: %s" % (e.args[0], e.args[1])
    finally:
        # cur/conn may not exist if the connection itself failed
        if cur:
            cur.close()
        if conn:
            conn.close()

# Extract the link to each work from the listing pages
def download_link():
    i = 1
    link_list = []
    while i < 10:
        original_url = 'http://gallery.artron.net/works/sheying-all-0-' + str(i) + '.html'
        # print the listing page being fetched
        print original_url
        my_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
        }
        response = requests.get(url=original_url, headers=my_headers)
        contents = response.text
        xml = etree.HTML(contents)
        # extract the links to the individual works
        datas = xml.xpath('//div[@class="pic"]/a/@href')
        # collect the links
        for data in datas:
            link_list.append(data)
            # print link_list
        # Alternative approach with BeautifulSoup:
        # soup = BeautifulSoup(contents, "lxml")
        # items = soup.find_all('a', target='_blank')
        # for div in items:
        #     print div.get('href')
        i += 1
    # start scraping only after all listing pages have been collected
    # (calling spider_data inside the loop would re-scrape the same links on every pass)
    spider_data(link_list)
    return link_list

# Scrape the required fields from each work's page
def spider_data(link_list):
    my_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
    }
    data_dictionary = {'title_text': '', 'author': '', 'price': '', 'size': '',
                       'creation_time': '', 'classification': '', 'material': '', 'theme': '',
                       'works_label': '', 'applicable_space': '', 'personal_profile': '',
                       'birthday': '', 'native_place': '', 'works_show': ''
                       }
    len_list = len(link_list)
    for num in range(len_list):
        print "Scraping link %s: %s" % (num + 1, link_list[num])
        response = requests.get(url=link_list[num], headers=my_headers)
        contents = response.text
        soup = BeautifulSoup(contents, "lxml")
        i = 0
        # work information
        for tag in soup.find_all(attrs={"class": "workIntro"}):
            # print tag.get_text()
            # title
            i = i + 1
            title_text = tag.find("h1").get_text()
            title_text = title_text.replace('\n', '')
            data_dictionary['title_text'] = title_text
            print title_text

            # author name (from the title attribute of the first <a> under the <li>)
            author = tag.select('li > a')[0].get('title')
            data_dictionary['author'] = author
            print author

            # price
            price = tag.find('em').get_text()
            data_dictionary['price'] = price
            print price

        # work details (the spec table; strip() removes the surrounding whitespace)
        for tag in soup.find_all(attrs={"class": "table"}):
            # size
            size = tag.select('td')[0].get_text().strip()
            data_dictionary['size'] = size
            print size
            # year of creation
            creation_time = tag.select('td')[1].get_text().strip()
            data_dictionary['creation_time'] = creation_time
            print creation_time
            # category
            classification = tag.select('td')[2].get_text().strip()
            data_dictionary['classification'] = classification
            print classification
            # material
            material = tag.select('td')[3].get_text().strip()
            data_dictionary['material'] = material
            print material
            # theme
            theme = tag.select('td')[4].get_text().strip()
            data_dictionary['theme'] = theme
            print theme
            # work tags
            works_label = tag.select('td')[5].get_text().strip()
            data_dictionary['works_label'] = works_label
            print works_label
            # applicable space (presumably the next <td> after the tags, i.e. index 6, not 5 again)
            applicable_space = tag.select('td')[6].get_text().strip()
            data_dictionary['applicable_space'] = applicable_space
            print applicable_space
            # print (u'applicable space: ' + data_dictionary['applicable_space'])
        # about the artist
        for tag in soup.find_all(attrs={"class": "introWrap htmlEdit"}):
            # biography: join all <p> paragraphs instead of keeping only the last one
            paragraphs = [pnum.get_text().strip() for pnum in tag.select('p')]
            personal_profile = '\n'.join(paragraphs)
            data_dictionary['personal_profile'] = personal_profile
            print personal_profile
        # artist details
        for tag in soup.find_all(attrs={"class": "authDetail mt20"}):
            # date of birth
            birthday = tag.select('td')[0].get_text()
            data_dictionary['birthday'] = birthday
            print birthday
            # birthplace
            native_place = tag.select('td')[1].get_text()
            data_dictionary['native_place'] = native_place
            print native_place

        # work images: collect every data-img URL instead of keeping only the last one
        for tag in soup.find_all(attrs={"class": "workShowLit"}):
            images = [imgnum.get('data-img') for imgnum in tag.select('li')]
            works_show = ','.join(img for img in images if img)
            data_dictionary['works_show'] = works_show
            print works_show

        # the main point: write the record to MySQL
        print u'Saving to database'
        DatabaseInfo(data_dictionary)
        print '\n'
    print u'Total works scraped:', len_list

if __name__ == '__main__':
    download_link()
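The proxies and multithreading mentioned at the top are not in yet. Below is a minimal sketch of how they could be bolted onto the requests calls above, using the proxies= argument and a multiprocessing.dummy thread pool; the proxy address, pool size and function names (fetch, crawl_concurrently) are placeholders, not part of the original script.

# Rough sketch of proxy + multithreading support (Python 2); placeholder values only
from multiprocessing.dummy import Pool as ThreadPool
import requests

PROXIES = {'http': 'http://127.0.0.1:8080'}  # placeholder proxy address
MY_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
}

def fetch(url):
    # every requests call accepts a proxies= argument
    response = requests.get(url, headers=MY_HEADERS, proxies=PROXIES, timeout=10)
    return response.text

def crawl_concurrently(link_list, workers=4):
    # fetch the work pages on a small thread pool instead of one by one
    pool = ThreadPool(workers)
    pages = pool.map(fetch, link_list)
    pool.close()
    pool.join()
    return pages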

Creating the table in the database:

CREATE TABLE `imginfor` (
  `ID` int(11) NOT NULL AUTO_INCREMENT,
  `title_text` varchar(200) COLLATE utf8_bin DEFAULT NULL COMMENT 'work title',
  `author` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT 'author name',
  `price` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT 'price',
  `size` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT 'size',
  `creation_time` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT 'year of creation',
  `classification` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT 'category',
  `material` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT 'material',
  `theme` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT 'theme',
  `works_label` varchar(200) COLLATE utf8_bin DEFAULT NULL COMMENT 'work tags',
  `applicable_space` varchar(200) COLLATE utf8_bin DEFAULT NULL COMMENT 'applicable space',
  `personal_profile` varchar(500) COLLATE utf8_bin DEFAULT NULL COMMENT 'artist biography',
  `birthday` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT 'date of birth',
  `native_place` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT 'birthplace',
  `works_show` text COLLATE utf8_bin COMMENT 'work image URLs',
  PRIMARY KEY (`ID`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
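To confirm that rows are actually being written, a quick check with the same MySQLdb connection settings used above is enough; this is just an illustrative snippet, not part of the crawler itself.

# Print the most recent rows from the imginfor table
import MySQLdb
conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='123456', port=3306, db='gallery')
cur = conn.cursor()
cur.execute('SELECT ID, title_text, author, price FROM imginfor ORDER BY ID DESC LIMIT 5')
for row in cur.fetchall():
    print row
cur.close()
conn.close()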

The results are as follows:

(screenshot of the crawler output omitted)

The database:

(screenshot of the imginfor table omitted)
