美文网首页
python,爬取蜂鸟网大师作品实例

python,爬取蜂鸟网大师作品实例

作者: 威武不能屈 | 来源:发表于2017-03-26 20:40 被阅读101次

蜂鸟网大师作品url:http://image.fengniao.com/list_1586.html
主要实现了:
1、手动输入第几页,保存该页中相册的url到文件中
2、从文件中读取已保存的相册url,以相册标题命名文件夹,在文件夹内保存相册的描述和相册内的图片集
3、也可以指定URL来保存某一个相册中的所有图片
执行过程:

图1-执行过程

执行结果如下:

图2-执行结果

实现代码

# -*- coding:utf-8 -*-
import urllib
import urllib2
import re
import os
import time




class ALBUM:

    def __init__(self):
        self.albumUrl = 'http://image.fengniao.com/slide/534/5342849_1.html'
        self.contentType = "text/html"
        self.headers = {'Content-Type':self.contentType}

    def getPage(self,url):
        try:
            request = urllib2.Request(url,headers=self.headers)
            response = urllib2.urlopen(request)
            # print response.read().decode('gbk')
            return response.read().decode('gbk')
        except urllib2.URLError,e:
            if hasattr(e,"reason"):
                print u'获取链接失败,失败原因:'+e.reason
            else:
                return None

    #获取相册标题
    def getAlbumTitle(self,content):
        parrern = re.compile('<h4 class="img-title">(.*?)</h4>',re.S)
        title = re.search(parrern,content)
        # print title.group(1).strip()
        return title.group(1).strip()


    #获取相册描述
    def getAlbumDescription(self,content):
        parrern = re.compile('<p class="describe-text">(.*?)</p>',re.S)
        description = re.search(parrern,content)
        # print description.group(1).strip()
        return description.group(1).strip()

    #获取相册总图片数
    def getAlbumImgCount(self,content):
        parrern = re.compile('<span class="total-num">(.*?)</span>',re.S)
        albumCount = re.search(parrern,content)
        # print albumCount.group(1).strip()
        return albumCount.group(1).strip()



    #获取相册图片
    def getAlbumImgs(self,content):
        # print content
        parrern = re.compile('"current_num".*?"pic_url":"(.*?)"',re.S)
        result = re.findall(parrern,content)
        # print result.group(1)
        imgUrls = []
        for i in result:
            img = re.sub(r'\\',"",i)
            # print img
            imgUrls.append(img.encode('utf-8'))

        # print imgUrls
        # print len(imgUrls)
        return imgUrls

    #保存图片
    def saveImg(self,imageUrl,fileName):
        u = urllib.urlopen(imageUrl)
        data = u.read()
        f = open(fileName,'wb')
        f.write(data)
        print u'正在保存相册中的图片',fileName
        f.close()

    #创建新目录
    def mkdir(self,path):
        path = path.strip()
        isExists = os.path.exists(path)
        if not isExists:
            print u'新建文件夹:',path
            os.makedirs(path)
            return True
        else:
            print u'名为:',path,u'已存在'
            return False


    #保存描述为txt,保存到以title命名的文件夹中
    def saveContent(self,content,fileName):
        name = 'fengniao'+ '/' + fileName +'/'+ fileName + '.txt'
        f = open(name,'w+')
        f.write(content.encode('utf-8'))
        f.close()

    #保存一个相册的图片和描述到同一个文件夹中
    def saveAlbum(self,url):
        content = self.getPage(url)
        # print content
        imgCount = self.getAlbumImgCount(content)
        imgTitle = self.getAlbumTitle(content)
        description = self.getAlbumDescription(content)
        path = 'fengniao' + '/' + imgTitle
        self.mkdir(path)
        self.saveContent(description,imgTitle)
        imgUrls = self.getAlbumImgs(content)
        number = 1
        for imgUrl in imgUrls:
            fileName = 'fengniao'+ '/' +imgTitle + '/' + str(number) + '.jpg'
            print u'开始保存第'+ str(number) + u'张图片'
            self.saveImg(imgUrl,fileName)
            number += 1

    # 从文件中读取url列表
    def readUrls(self,fileName):
        name = fileName + '.txt'
        if not os.path.exists(fileName):
            time.sleep(10)
        f = open(name , 'r')
        urls = []
        for line in f.readlines():
            # print(line.strip())
            urls.append(line.strip())
        f.close()
        # if urls == []:
        #     print u'ao 获取文件内容为空'
        #     return None
        return urls

    #保存一页内相册中的图片
    def saveAlbums(self,fileName):
        number = 1
        print u'正在读取文件...'
        urls = self.readUrls(fileName)
        # print urls
        if urls == None:
            return
        try:
            for url in urls:
                print u'保存第' + str(number) + u'个相册'
                self.saveAlbum(url)
                number += 1
        except IOError,e:
            print u'写入异常。。,错误信息'+ e.message

        finally:
            print u'写入成功'




# url = 'http://image.fengniao.com/slide/534/5342849_1.html'
# album = ALBUM(url)
# album.getPage(1)
# album.getAlbumTitle()
# album.getAlbumDescription()
# album.getAlbumImgCount()
# imageUrl = album.getAlbumImg()
# album.saveImg(imageUrl,'2.jpg')
# album.saveContent()
# album.saveAlbum()
# album.saveAlbums()




class FNLT:

    def __init__(self):
        self.file = None
        self.siteUrl = 'http://image.fengniao.com/list_1586_'

    def getPage(self,pageNum):
        try:
            url = self.siteUrl + str(pageNum) + ".html"
            request = urllib2.Request(url)
            response = urllib2.urlopen(request)
            # print response.read().decode('gbk')
            return response.read().decode('gbk')
        except urllib2.URLError,e:
            if hasattr(e,"reason"):
                print u'打开页面失败...,失败原因是:',e.reason
                return None

    #获取页数
    def getPageNum(self,page):
        parrern = re.compile('<div class="page_num".*?</span>(<a.*?</a>){4}.*?<a.*?">(.*?)</a>',re.S)
        result = re.search(parrern,page)
        # print result.group(2).strip()
        return result.group(2).strip()

    #获取相册标题
    def getAlbumTitle(self,page):
        parrern = re.compile('<a class="pic".*?</a>.*?>(.*?)</a>',re.S)
        items = re.findall(parrern,page)
        # for item in items:
        #     print item
        return items

    #获取相册url
    def getAlbumAddr(self,page):
        parrern = re.compile('<a class="pic" href="(.*?)"><img',re.S)
        result = re.findall(parrern,page)
        addrs = []
        for addr in result:
            parrern = re.compile(r'http://image.fengniao.com/slide/.*?')
            if not re.match(parrern,addr):
                print u'与相册url不匹配...,不保存该url'
            else:
                addrs.append(addr.encode('utf-8'))
        return addrs

    #相册链接保存在文件中
    def writeData(self,content):
        for item in content:
            # print u'正在保存链接...'
            self.file.write(item)
            self.file.write("\n")


    def start(self,fileName):
        needPageNum = raw_input(u'输入需要保存相册的页码')
        if int(needPageNum) == None:
            print u'ao 输入错误'
            return
        print u'正在获取内容,请稍等。。。'
        indexPage = self.getPage(1)
        pageNum = self.getPageNum(indexPage)
        self.file = open(fileName + '.txt','w+')
        if pageNum == None:
            print u'URL已失效,请重试'
            return
        try:
            print u'写入第' + str(needPageNum) + u'页相册链接'
            content = self.getPage(needPageNum)
            addrs = self.getAlbumAddr(content)
            self.writeData(addrs)
        except IOError,e:
            if hasattr(e,"reason"):
                print u'写出出错啦,错误原因:',e.reason
                return None
        finally:
            print u'写入成功'
            self.file.close()


# Entry point: scrape one listing page of album links into first.txt,
# then download every album referenced there.
OUTPUT_NAME = 'first'

crawler = FNLT()
downloader = ALBUM()

# Step 1: prompt for a page number and write its album URLs to OUTPUT_NAME + '.txt'.
crawler.start(OUTPUT_NAME)
# Step 2: download every album listed in that file into fengniao/<title>/.
downloader.saveAlbums(OUTPUT_NAME)

相关文章

网友评论

      本文标题:python,爬取蜂鸟网大师作品实例

      本文链接:https://www.haomeiwen.com/subject/whnkottx.html