美文网首页
python自动爬取别致数据并保存图片

python自动爬取别致数据并保存图片

作者: 代码没写完休想上厕所 | 来源:发表于2016-11-09 14:05 被阅读86次
    #!/usr/bin/env python
    # -*- coding: utf-8 -*- 
    import urllib2
    import urllib
    import xlrd
    import xlwt
    import xlutils
    from xlutils.copy import copy
    import sys
    reload(sys)
    sys.setdefaultencoding('utf8')
    
    '''
    Created on Nov 9, 2016
    
    @author: xwang
    
    Usage:
    1. Create "biezhidb.xls" in the same directory as this script.
    2. Pictures are saved to /Users/xunwang/Desktop/别致爬虫/pic2/
    '''
    
    # Item pages are requested as baseUrl followed by a numeric id.
    baseUrl = "http://chocolateback.sinaapp.com/"
    # Row index used for spreadsheet writes; reassigned at startup and
    # advanced by the crawler after each stored item.
    currentRow = 0
    
    def testXlrd(filename):
        """Return the number of rows in the first sheet of workbook *filename*.

        The workbook is opened with ``xlrd.open_workbook`` and only its
        sheet at index 0 is inspected.
        """
        first_sheet = xlrd.open_workbook(filename).sheet_by_index(0)
        return first_sheet.nrows
    
    def testXlwt(filename, index, data):
        """Write *data* into one cell of the first sheet and save the workbook.

        The cell is at row ``currentRow`` (module-level global) and column
        *index*. The workbook is re-read from *filename*, converted with
        ``xlutils.copy.copy``, modified and saved back to the same path on
        every call.

        Args:
            filename: path of the ``.xls`` workbook to update in place.
            index: zero-based column to write to.
            data: value stored in the cell.
        """
        # The object returned by xlrd is only used as input to copy(); the
        # copy is what gets modified and saved.
        writable_book = copy(xlrd.open_workbook(filename))
        writable_book.get_sheet(0).write(currentRow, index, data)
        writable_book.save(filename)
    
    def _extract_between(text, head, tail):
        """Return the text between the first *head* and the following *tail*.

        The result is whitespace-stripped and decoded from UTF-8.

        Raises:
            ValueError: if *head* is absent, or *tail* does not occur after it.
        """
        start = text.find(head)
        if start == -1:
            raise ValueError("marker not found: %r" % head)
        start += len(head)
        # Search for the terminator only after the opening marker has ended.
        end = text.find(tail, start)
        if end == -1:
            raise ValueError("closing marker not found: %r" % tail)
        return unicode(text[start:end].strip(), "utf-8")

    def pachong(start=10000, stop=14000):
        """Crawl item pages and record each item in 'biezhidb.xls'.

        For every id in ``range(start, stop)`` the page ``baseUrl + str(id)``
        is fetched, and the link, title, price and picture URL are cut out of
        the HTML. They are written to columns 2, 1, 3 and 4 of row
        ``currentRow`` (the id goes to column 0), the picture is downloaded
        with ``storePic``, and ``currentRow`` is advanced.

        All fields are extracted before anything is written, so a page that
        lacks one of the markers leaves no partially written row. An id whose
        page fails to download or parse is reported on stdout and skipped.

        Args:
            start: first id to crawl (inclusive).
            stop: id at which to stop (exclusive).
        """
        global currentRow
        for i in range(start, stop):
            page_url = baseUrl + str(i)
            print(page_url)
            try:
                response = urllib2.urlopen(page_url)
                try:
                    cont = response.read()
                finally:
                    response.close()

                url = _extract_between(cont, '<a target="_blank" href="', '">')
                # Skip links whose characters 7..14 spell "redirect"
                # (presumably the host part right after "http://" - confirm).
                if url[7:15] == "redirect":
                    continue
                title = _extract_between(cont, '<span class="title">', '</span>')
                print(title)
                price = _extract_between(cont, '<span class="price">', '</span>')
                # The marker consumes the leading "h" of the image URL, so it
                # only matches sources starting with "h"; put the letter back.
                picUrl = u"h" + _extract_between(cont, '<img src="h', '">')

                testXlwt('biezhidb.xls', 0, str(i))
                testXlwt('biezhidb.xls', 1, title)
                testXlwt('biezhidb.xls', 2, url)
                testXlwt('biezhidb.xls', 3, price)
                testXlwt('biezhidb.xls', 4, picUrl)
                storePic(picUrl, i)
                # Advance only after the picture is stored; a failed item's
                # row is reused by the next successful one.
                currentRow += 1
            except Exception as err:
                # Best-effort crawl: report and move on. Catching Exception
                # rather than everything keeps Ctrl-C and SystemExit working.
                print("skipped %s: %r" % (page_url, err))
                continue
    
    def storePic(url, id, dest_dir='/Users/xunwang/Desktop/别致爬虫/pic2/'):
        """Download *url* and save it as ``<dest_dir>biezhi_<id>.jpg``.

        Args:
            url: address of the picture to fetch.
            id: item identifier embedded in the file name. The name shadows
                the builtin but is kept so existing callers are unaffected.
            dest_dir: directory prefix for the saved file. It is concatenated
                as-is, so it must end with a path separator. Defaults to the
                author's original location.
        """
        urllib.urlretrieve(url, '%sbiezhi_%s.jpg' % (dest_dir, id))
    
    if __name__ == '__main__':
        # Start at the row count already present in the workbook.
        currentRow = testXlrd('biezhidb.xls')
        print(currentRow)
        pachong()
        print("写入完毕!")
    

    相关文章

      网友评论

          本文标题:python自动爬取别致数据并保存图片

          本文链接:https://www.haomeiwen.com/subject/ktzzuttx.html