美文网首页
python,爬取蜂鸟网大师作品实例

python,爬取蜂鸟网大师作品实例

作者: 威武不能屈 | 来源:发表于2017-03-26 20:40 被阅读101次

    蜂鸟网大师作品url:http://image.fengniao.com/list_1586.html
    主要实现了:
    1、手动输入第几页,保存该页中相册的url到文件中
    2、从文件中读取已保存的相册url,以相册标题命名文件夹,在文件夹内保存相册的描述和相册内的图片集
    3、也可以指定URL来保存某一个相册中的所有图片
    执行过程:

    图1-执行过程

    执行结果如下:

    图2-执行结果

    实现代码

    # -*- coding:utf-8 -*-
    import urllib
    import urllib2
    import re
    import os
    import time
    
    
    
    
    class ALBUM:
    
        def __init__(self):
            self.albumUrl = 'http://image.fengniao.com/slide/534/5342849_1.html'
            self.contentType = "text/html"
            self.headers = {'Content-Type':self.contentType}
    
        def getPage(self,url):
            try:
                request = urllib2.Request(url,headers=self.headers)
                response = urllib2.urlopen(request)
                # print response.read().decode('gbk')
                return response.read().decode('gbk')
            except urllib2.URLError,e:
                if hasattr(e,"reason"):
                    print u'获取链接失败,失败原因:'+e.reason
                else:
                    return None
    
        #获取相册标题
        def getAlbumTitle(self,content):
            parrern = re.compile('<h4 class="img-title">(.*?)</h4>',re.S)
            title = re.search(parrern,content)
            # print title.group(1).strip()
            return title.group(1).strip()
    
    
        #获取相册描述
        def getAlbumDescription(self,content):
            parrern = re.compile('<p class="describe-text">(.*?)</p>',re.S)
            description = re.search(parrern,content)
            # print description.group(1).strip()
            return description.group(1).strip()
    
        #获取相册总图片数
        def getAlbumImgCount(self,content):
            parrern = re.compile('<span class="total-num">(.*?)</span>',re.S)
            albumCount = re.search(parrern,content)
            # print albumCount.group(1).strip()
            return albumCount.group(1).strip()
    
    
    
        #获取相册图片
        def getAlbumImgs(self,content):
            # print content
            parrern = re.compile('"current_num".*?"pic_url":"(.*?)"',re.S)
            result = re.findall(parrern,content)
            # print result.group(1)
            imgUrls = []
            for i in result:
                img = re.sub(r'\\',"",i)
                # print img
                imgUrls.append(img.encode('utf-8'))
    
            # print imgUrls
            # print len(imgUrls)
            return imgUrls
    
        #保存图片
        def saveImg(self,imageUrl,fileName):
            u = urllib.urlopen(imageUrl)
            data = u.read()
            f = open(fileName,'wb')
            f.write(data)
            print u'正在保存相册中的图片',fileName
            f.close()
    
        #创建新目录
        def mkdir(self,path):
            path = path.strip()
            isExists = os.path.exists(path)
            if not isExists:
                print u'新建文件夹:',path
                os.makedirs(path)
                return True
            else:
                print u'名为:',path,u'已存在'
                return False
    
    
        #保存描述为txt,保存到以title命名的文件夹中
        def saveContent(self,content,fileName):
            name = 'fengniao'+ '/' + fileName +'/'+ fileName + '.txt'
            f = open(name,'w+')
            f.write(content.encode('utf-8'))
            f.close()
    
        #保存一个相册的图片和描述到同一个文件夹中
        def saveAlbum(self,url):
            content = self.getPage(url)
            # print content
            imgCount = self.getAlbumImgCount(content)
            imgTitle = self.getAlbumTitle(content)
            description = self.getAlbumDescription(content)
            path = 'fengniao' + '/' + imgTitle
            self.mkdir(path)
            self.saveContent(description,imgTitle)
            imgUrls = self.getAlbumImgs(content)
            number = 1
            for imgUrl in imgUrls:
                fileName = 'fengniao'+ '/' +imgTitle + '/' + str(number) + '.jpg'
                print u'开始保存第'+ str(number) + u'张图片'
                self.saveImg(imgUrl,fileName)
                number += 1
    
        # 从文件中读取url列表
        def readUrls(self,fileName):
            name = fileName + '.txt'
            if not os.path.exists(fileName):
                time.sleep(10)
            f = open(name , 'r')
            urls = []
            for line in f.readlines():
                # print(line.strip())
                urls.append(line.strip())
            f.close()
            # if urls == []:
            #     print u'ao 获取文件内容为空'
            #     return None
            return urls
    
        #保存一页内相册中的图片
        def saveAlbums(self,fileName):
            number = 1
            print u'正在读取文件...'
            urls = self.readUrls(fileName)
            # print urls
            if urls == None:
                return
            try:
                for url in urls:
                    print u'保存第' + str(number) + u'个相册'
                    self.saveAlbum(url)
                    number += 1
            except IOError,e:
                print u'写入异常。。,错误信息'+ e.message
    
            finally:
                print u'写入成功'
    
    
    
    
    # url = 'http://image.fengniao.com/slide/534/5342849_1.html'
    # album = ALBUM(url)
    # album.getPage(1)
    # album.getAlbumTitle()
    # album.getAlbumDescription()
    # album.getAlbumImgCount()
    # imageUrl = album.getAlbumImg()
    # album.saveImg(imageUrl,'2.jpg')
    # album.saveContent()
    # album.saveAlbum()
    # album.saveAlbums()
    
    
    
    
    class FNLT:
    
        def __init__(self):
            self.file = None
            self.siteUrl = 'http://image.fengniao.com/list_1586_'
    
        def getPage(self,pageNum):
            try:
                url = self.siteUrl + str(pageNum) + ".html"
                request = urllib2.Request(url)
                response = urllib2.urlopen(request)
                # print response.read().decode('gbk')
                return response.read().decode('gbk')
            except urllib2.URLError,e:
                if hasattr(e,"reason"):
                    print u'打开页面失败...,失败原因是:',e.reason
                    return None
    
        #获取页数
        def getPageNum(self,page):
            parrern = re.compile('<div class="page_num".*?</span>(<a.*?</a>){4}.*?<a.*?">(.*?)</a>',re.S)
            result = re.search(parrern,page)
            # print result.group(2).strip()
            return result.group(2).strip()
    
        #获取相册标题
        def getAlbumTitle(self,page):
            parrern = re.compile('<a class="pic".*?</a>.*?>(.*?)</a>',re.S)
            items = re.findall(parrern,page)
            # for item in items:
            #     print item
            return items
    
        #获取相册url
        def getAlbumAddr(self,page):
            parrern = re.compile('<a class="pic" href="(.*?)"><img',re.S)
            result = re.findall(parrern,page)
            addrs = []
            for addr in result:
                parrern = re.compile(r'http://image.fengniao.com/slide/.*?')
                if not re.match(parrern,addr):
                    print u'与相册url不匹配...,不保存该url'
                else:
                    addrs.append(addr.encode('utf-8'))
            return addrs
    
        #相册链接保存在文件中
        def writeData(self,content):
            for item in content:
                # print u'正在保存链接...'
                self.file.write(item)
                self.file.write("\n")
    
    
        def start(self,fileName):
            needPageNum = raw_input(u'输入需要保存相册的页码')
            if int(needPageNum) == None:
                print u'ao 输入错误'
                return
            print u'正在获取内容,请稍等。。。'
            indexPage = self.getPage(1)
            pageNum = self.getPageNum(indexPage)
            self.file = open(fileName + '.txt','w+')
            if pageNum == None:
                print u'URL已失效,请重试'
                return
            try:
                print u'写入第' + str(needPageNum) + u'页相册链接'
                content = self.getPage(needPageNum)
                addrs = self.getAlbumAddr(content)
                self.writeData(addrs)
            except IOError,e:
                if hasattr(e,"reason"):
                    print u'写出出错啦,错误原因:',e.reason
                    return None
            finally:
                print u'写入成功'
                self.file.close()
    
    
    fileName = 'first'
    fnlt = FNLT()
    album = ALBUM()
    
    # Step 1: prompt for a page number and save that page's album URLs to first.txt
    fnlt.start(fileName)
    # Step 2: download every album listed in first.txt into fengniao/<title>/ folders
    album.saveAlbums(fileName)
    
    
    
    # f = open('first1.txt','r')
    # print f.readline()
    

    相关文章

      网友评论

          本文标题:python,爬取蜂鸟网大师作品实例

          本文链接:https://www.haomeiwen.com/subject/whnkottx.html