蜂鸟网大师作品url:http://image.fengniao.com/list_1586.html
主要实现了:
1、手动输入第几页,保存该页中相册的url到文件中
2、从文件中读取已保存的相册url,以相册标题命名文件夹,在文件夹内保存相册的描述和相册内的图片集
3、也可以指定URL来保存某一个相册中的所有图片
执行过程:

执行结果如下:

实现代码
# -*- coding:utf-8 -*-
import urllib
import urllib2
import re
import os
import time
class ALBUM:
def __init__(self):
self.albumUrl = 'http://image.fengniao.com/slide/534/5342849_1.html'
self.contentType = "text/html"
self.headers = {'Content-Type':self.contentType}
def getPage(self,url):
try:
request = urllib2.Request(url,headers=self.headers)
response = urllib2.urlopen(request)
# print response.read().decode('gbk')
return response.read().decode('gbk')
except urllib2.URLError,e:
if hasattr(e,"reason"):
print u'获取链接失败,失败原因:'+e.reason
else:
return None
#获取相册标题
def getAlbumTitle(self,content):
parrern = re.compile('<h4 class="img-title">(.*?)</h4>',re.S)
title = re.search(parrern,content)
# print title.group(1).strip()
return title.group(1).strip()
#获取相册描述
def getAlbumDescription(self,content):
parrern = re.compile('<p class="describe-text">(.*?)</p>',re.S)
description = re.search(parrern,content)
# print description.group(1).strip()
return description.group(1).strip()
#获取相册总图片数
def getAlbumImgCount(self,content):
parrern = re.compile('<span class="total-num">(.*?)</span>',re.S)
albumCount = re.search(parrern,content)
# print albumCount.group(1).strip()
return albumCount.group(1).strip()
#获取相册图片
def getAlbumImgs(self,content):
# print content
parrern = re.compile('"current_num".*?"pic_url":"(.*?)"',re.S)
result = re.findall(parrern,content)
# print result.group(1)
imgUrls = []
for i in result:
img = re.sub(r'\\',"",i)
# print img
imgUrls.append(img.encode('utf-8'))
# print imgUrls
# print len(imgUrls)
return imgUrls
#保存图片
def saveImg(self,imageUrl,fileName):
u = urllib.urlopen(imageUrl)
data = u.read()
f = open(fileName,'wb')
f.write(data)
print u'正在保存相册中的图片',fileName
f.close()
#创建新目录
def mkdir(self,path):
path = path.strip()
isExists = os.path.exists(path)
if not isExists:
print u'新建文件夹:',path
os.makedirs(path)
return True
else:
print u'名为:',path,u'已存在'
return False
#保存描述为txt,保存到以title命名的文件夹中
def saveContent(self,content,fileName):
name = 'fengniao'+ '/' + fileName +'/'+ fileName + '.txt'
f = open(name,'w+')
f.write(content.encode('utf-8'))
f.close()
#保存一个相册的图片和描述到同一个文件夹中
def saveAlbum(self,url):
content = self.getPage(url)
# print content
imgCount = self.getAlbumImgCount(content)
imgTitle = self.getAlbumTitle(content)
description = self.getAlbumDescription(content)
path = 'fengniao' + '/' + imgTitle
self.mkdir(path)
self.saveContent(description,imgTitle)
imgUrls = self.getAlbumImgs(content)
number = 1
for imgUrl in imgUrls:
fileName = 'fengniao'+ '/' +imgTitle + '/' + str(number) + '.jpg'
print u'开始保存第'+ str(number) + u'张图片'
self.saveImg(imgUrl,fileName)
number += 1
# 从文件中读取url列表
def readUrls(self,fileName):
name = fileName + '.txt'
if not os.path.exists(fileName):
time.sleep(10)
f = open(name , 'r')
urls = []
for line in f.readlines():
# print(line.strip())
urls.append(line.strip())
f.close()
# if urls == []:
# print u'ao 获取文件内容为空'
# return None
return urls
#保存一页内相册中的图片
def saveAlbums(self,fileName):
number = 1
print u'正在读取文件...'
urls = self.readUrls(fileName)
# print urls
if urls == None:
return
try:
for url in urls:
print u'保存第' + str(number) + u'个相册'
self.saveAlbum(url)
number += 1
except IOError,e:
print u'写入异常。。,错误信息'+ e.message
finally:
print u'写入成功'
# url = 'http://image.fengniao.com/slide/534/5342849_1.html'
# album = ALBUM(url)
# album.getPage(1)
# album.getAlbumTitle()
# album.getAlbumDescription()
# album.getAlbumImgCount()
# imageUrl = album.getAlbumImg()
# album.saveImg(imageUrl,'2.jpg')
# album.saveContent()
# album.saveAlbum()
# album.saveAlbums()
class FNLT:
def __init__(self):
self.file = None
self.siteUrl = 'http://image.fengniao.com/list_1586_'
def getPage(self,pageNum):
try:
url = self.siteUrl + str(pageNum) + ".html"
request = urllib2.Request(url)
response = urllib2.urlopen(request)
# print response.read().decode('gbk')
return response.read().decode('gbk')
except urllib2.URLError,e:
if hasattr(e,"reason"):
print u'打开页面失败...,失败原因是:',e.reason
return None
#获取页数
def getPageNum(self,page):
parrern = re.compile('<div class="page_num".*?</span>(<a.*?</a>){4}.*?<a.*?">(.*?)</a>',re.S)
result = re.search(parrern,page)
# print result.group(2).strip()
return result.group(2).strip()
#获取相册标题
def getAlbumTitle(self,page):
parrern = re.compile('<a class="pic".*?</a>.*?>(.*?)</a>',re.S)
items = re.findall(parrern,page)
# for item in items:
# print item
return items
#获取相册url
def getAlbumAddr(self,page):
parrern = re.compile('<a class="pic" href="(.*?)"><img',re.S)
result = re.findall(parrern,page)
addrs = []
for addr in result:
parrern = re.compile(r'http://image.fengniao.com/slide/.*?')
if not re.match(parrern,addr):
print u'与相册url不匹配...,不保存该url'
else:
addrs.append(addr.encode('utf-8'))
return addrs
#相册链接保存在文件中
def writeData(self,content):
for item in content:
# print u'正在保存链接...'
self.file.write(item)
self.file.write("\n")
def start(self,fileName):
needPageNum = raw_input(u'输入需要保存相册的页码')
if int(needPageNum) == None:
print u'ao 输入错误'
return
print u'正在获取内容,请稍等。。。'
indexPage = self.getPage(1)
pageNum = self.getPageNum(indexPage)
self.file = open(fileName + '.txt','w+')
if pageNum == None:
print u'URL已失效,请重试'
return
try:
print u'写入第' + str(needPageNum) + u'页相册链接'
content = self.getPage(needPageNum)
addrs = self.getAlbumAddr(content)
self.writeData(addrs)
except IOError,e:
if hasattr(e,"reason"):
print u'写出出错啦,错误原因:',e.reason
return None
finally:
print u'写入成功'
self.file.close()
# Entry point: scrape one listing page's album URLs into first.txt, then
# download every album listed there. The __main__ guard keeps the network
# and stdin side effects from firing on a mere import of this module.
if __name__ == '__main__':
    fileName = 'first'
    fnlt = FNLT()
    album = ALBUM()
    # Step 1: prompt for a page number and save its album URLs to the file.
    fnlt.start(fileName)
    # Step 2: download every album listed in the file into fengniao/.
    album.saveAlbums(fileName)
网友评论