Preface
Life got too busy, so instead of filling this hole in bit by bit as I'd planned, I'm just filling it all in at once. P.S. There's still some content I wanted to build that isn't finished, but the basic downloader functionality is done.
TO DO LIST
1. Rework the progress bar into a decorator pattern (if possible; right now it doesn't look doable)
2. Verify file integrity
a. By file size (resources that don't support chunking can't actually be verified by size either) (DONE)
b. By md5; not every downloadable resource exposes this field (a minimal sketch follows this list)
3. Reconnect after a dropped connection / resume interrupted downloads
4. Support BT (torrent) downloads
5. A GUI
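For item 2b, here is a minimal sketch of what an md5 check could look like, assuming the site publishes an expected digest alongside the download; checkMd5 is a hypothetical helper, not part of this project:

import hashlib

# Hypothetical helper for TODO item 2b: hash the finished file in chunks
# and compare against a digest published alongside the download
# (when one exists at all).
def checkMd5(filePath, expectedMd5):
    md5 = hashlib.md5()
    with open(filePath, 'rb') as f:
        # Read 1 MB at a time so large files don't need to fit in memory
        for chunk in iter(lambda: f.read(1024 * 1024), b''):
            md5.update(chunk)
    return md5.hexdigest() == expectedMd5.lower()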
Project breakdown
A very simple three-step plan:
1. Decide whether the file needs to be split into multiple downloads
2. Based on that decision, split the file into chunks and set up a download for each chunk
3. Merge the chunks back into one complete file, then verify the file's integrity
Step 1: Decide whether to split the file into multiple downloads
Import the required package: requests. Before the download proper there has to be a step that fetches the file's size. With the size in hand, we split the file and return the corresponding chunks, then create coroutines to download them.
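As an aside, the size could also be fetched with a HEAD request, which never starts the body transfer; some servers reject HEAD, though, which is why the code below sticks with a streamed GET. A minimal sketch, reusing the test URL from the end of this post:

import requests

# Read Content-Length via HEAD instead of a streamed GET.
# Servers that refuse HEAD simply won't return the header.
resp = requests.head('https://dldir1.qq.com/wework/work_weixin/WeCom_3.1.1.3006.exe',
                     allow_redirects = True)
print(resp.headers.get('Content-Length'))  # the size in bytes, or None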
import requests

# Decide whether the resource needs chunked downloading
# Whole-file download, return format: {'code': 0, 'downloadFileName': 'xxxxxxxxxxx', 'urlList': [{'fileName': 'xxxx.download'}]}
# Chunked download, return format: {'code': 1, 'downloadFileName': 'xxxxxxxxxxx', 'urlList': [{'fileName': 'xxxx.download_part0', 'Range': 'bytes=0-104857599'}, ...]}
# 100 MB per chunk
def cutOrHold(url, downloadSize = 1024 * 1024 * 100):
    # The data structure we return
    data = {
        'code': 0,
        'downloadFileName': '',
        'fileSize': 0,
        'urlList': []
    }
This is a return structure of my own devising, so it's easy to consume elsewhere. It mainly tells the caller whether chunking is needed, what the file should be named, the total file size, and each chunk. See the comments above for the exact layout.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
    }
    r = requests.get(url = url, stream = True, headers = HEADERS)
(I found that the request also goes through fine without this header.)
    # Normalize the header names to lowercase; keep the values as-is,
    # otherwise the file name in Content-Disposition gets lowercased too
    newHeader = {}
    for key in r.headers:
        newHeader[key.lower()] = r.headers[key]
    # We only needed the headers, so close the streamed response
    r.close()
    # print (newHeader)
    # Check whether the response carries a file name; if not, take the last segment of the URL
    if 'content-disposition' in newHeader.keys():
        fileName = newHeader['content-disposition'].split('=')[1].replace('"', '')
    else:
        fileName = url.split('/')[-1]
    data['downloadFileName'] = fileName
    fileName = fileName + '.download'
The file being downloaded is temporarily named xxx.download; if it's downloaded as multiple chunks, the temp files are named like xxx.download_part1 (handled further down).
    # Decide whether chunking is needed; without Content-Length we can't
    # split the file, so fall back to a single whole-file download
    if 'content-length' not in newHeader.keys():
        data['urlList'].append({'fileName': fileName})
        return data
    total = int(newHeader['content-length'])
    # Ceiling division, so a size that is an exact multiple of downloadSize
    # doesn't produce an empty extra chunk
    count = (total + downloadSize - 1) // downloadSize
    data['fileSize'] = total
    # If the file is small, there's no need to chunk the download
    if downloadSize >= total:
        data['urlList'].append({'fileName': fileName})
        return data
    # Temporary chunk files are named like: xxx.dmg.download_part3
    # HTTP Range end offsets are inclusive, so the last byte is total - 1
    downloadHeaderList = [
        {'fileName': fileName + '_part' + str(i),
         'Range': f'bytes={i * downloadSize}-{min((i + 1) * downloadSize, total) - 1}'}
        for i in range(count)
    ]
downloadHeaderList is built with a list comprehension; its structure is: [{'fileName': 'xxxx.download_part0', 'Range': 'bytes=0-99'}, {'fileName': 'xxxx.download_part1', 'Range': 'bytes=100-199'}]. The only slightly tricky part is getting the arithmetic right so the chunks line up exactly; there's a worked example right after the function.
    data['code'] = 1
    data['urlList'] = downloadHeaderList
    return data
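To sanity-check the boundary arithmetic, here is a toy standalone example with made-up numbers (a 250-byte file and a 100-byte chunk size; byte ranges in HTTP are inclusive on both ends):

# Toy check of the chunk arithmetic: a 250-byte file in 100-byte chunks.
total, downloadSize = 250, 100
count = (total + downloadSize - 1) // downloadSize  # -> 3 chunks
ranges = [f'bytes={i * downloadSize}-{min((i + 1) * downloadSize, total) - 1}'
          for i in range(count)]
print(ranges)  # ['bytes=0-99', 'bytes=100-199', 'bytes=200-249']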
Step 2: Split the file into chunks and set up a download for each chunk
Time to breathe some life into it: import the required libraries.
import requests, aiohttp, aiofiles, asyncio
import os,time
import tools
from tqdm import tqdm
And here is the code proper.
class downLoader():
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
    }
    # 100 MB per chunk
    downloadSize = 1024 * 1024 * 100

    def __init__(self, url):
        self.url = url
        self.canCut = 1
        self.fileName = ''
        self.fileSize = 0
        self.downPath = self.downloadPath()
        self.urlList = self.cutFileSize()

    # Create the aiohttp session
    async def createSession(self):
        session = aiohttp.ClientSession(headers = downLoader.HEADERS)
        return session

    # Work out where to download to
    def downloadPath(self):
        basepath = os.path.abspath(os.path.dirname(__file__))
        down_path = os.path.join(basepath, 'downloads')
        if not os.path.isdir(down_path):
            os.mkdir(down_path)
            print('Create download path {}'.format(down_path))
        return down_path

    # Use this to decide whether chunking is needed
    def cutFileSize(self):
        data = tools.cutOrHold(self.url, downLoader.downloadSize)
        self.canCut = data['code']
        self.fileName = data['downloadFileName']
        self.fileSize = data['fileSize']
        return data['urlList']
    # Core download code for the chunked downloader
    async def downloadMuti(self, urlMeta, session, bar):
        # Copy the class-level headers: mutating the shared dict here would
        # make all of the concurrent coroutines race on the same 'Range'
        headers = dict(downLoader.HEADERS)
        headers['Range'] = urlMeta['Range']
        filePath = os.path.join(self.downPath, urlMeta['fileName'])
        async with session.get(self.url, headers = headers, verify_ssl = False) as resp:
            async with aiofiles.open(filePath, 'wb') as f:
                while True:
                    chunk = await resp.content.read(1024)
                    if not chunk:
                        break
                    # Advance the bar by the real chunk size; the final read
                    # is usually shorter than 1024 bytes
                    bar.update(len(chunk))
                    await f.write(chunk)
    # Core download code for the whole-file downloader
    async def downloadOne(self, urlMeta, bar):
        headers = downLoader.HEADERS
        filePath = os.path.join(self.downPath, self.fileName)
        print(filePath)
        print('--------------')
        async with aiohttp.request('GET', self.url, headers = headers) as resp:
            async with aiofiles.open(filePath, 'wb') as f:
                while True:
                    chunk = await resp.content.read(1024)
                    if not chunk:
                        break
                    bar.update(len(chunk))
                    await f.write(chunk)
    # Don't split the file; download it in one piece
    def doNotCut(self, bar):
        fileName = self.urlList[0]['fileName']
        print('Do Not Cut File {}'.format(fileName))
        # Create the event loop
        loop = asyncio.get_event_loop()
        # Create the coroutine download task (pass the chunk metadata, to
        # match downloadOne's signature)
        tasks = [asyncio.ensure_future(self.downloadOne(self.urlList[0], bar))]
        loop.run_until_complete(asyncio.wait(tasks))
        # Close the event loop
        loop.close()
        check = tools.mergeFile(self.downPath, self.fileName)
        if check == 1:
            isFine = tools.checkFileIntegrity(self.fileSize, os.path.join(self.downPath, self.fileName))
        return
    # Split the file and download it in chunks
    def cutFile(self, bar):
        print('Cut the {} to download'.format(self.fileName))
        # Create the event loop
        loop = asyncio.get_event_loop()
        # Create the session
        session = loop.run_until_complete(self.createSession())
        # Create one coroutine download task per chunk
        tasks = [asyncio.ensure_future(self.downloadMuti(meta, session, bar)) for meta in self.urlList]
        loop.run_until_complete(asyncio.wait(tasks))
        # A ClientSession you created yourself has to be closed manually, and
        # session.close() is a coroutine, so it has to run on the event loop
        loop.run_until_complete(session.close())
        # Close the event loop
        loop.close()
        check = tools.mergeFile(self.downPath, self.fileName)
        if check == 1:
            isFine = tools.checkFileIntegrity(self.fileSize, os.path.join(self.downPath, self.fileName))
        return
    def run(self):
        with tqdm(total = self.fileSize, unit = 'B', desc = f'Downloading: {self.fileName}',
                  unit_divisor = 1024, ascii = True, unit_scale = True) as bar:
            if self.canCut == 0:
                self.doNotCut(bar)
            if self.canCut == 1:
                self.cutFile(bar)
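The class leans on two helpers from tools that this post doesn't show: mergeFile (stitches the _part files back together, or just strips the temporary .download suffix) and checkFileIntegrity (the size check from TODO item 2a). Here is a minimal sketch of what they might look like, reconstructed purely from how downLoader calls them; treat the details as assumptions, not the project's actual code:

import os, glob, shutil

# Guessed reconstruction of tools.mergeFile: concatenate the
# xxx.download_partN chunks into the final file (or just rename a
# whole-file download) and return 1 on success.
def mergeFile(downPath, fileName):
    finalPath = os.path.join(downPath, fileName)
    tempPath = os.path.join(downPath, fileName + '.download')
    parts = sorted(glob.glob(tempPath + '_part*'),
                   key = lambda p: int(p.rsplit('_part', 1)[1]))
    if parts:
        with open(finalPath, 'wb') as out:
            for part in parts:
                with open(part, 'rb') as src:
                    shutil.copyfileobj(src, out)
                os.remove(part)
    elif os.path.isfile(tempPath):
        # Whole-file download: just drop the temporary suffix
        os.rename(tempPath, finalPath)
    return 1

# Guessed reconstruction of tools.checkFileIntegrity (TODO item 2a):
# compare the size on disk against the reported Content-Length; a
# fileSize of 0 means the server never told us, so there's nothing to check.
def checkFileIntegrity(expectedSize, filePath):
    if not expectedSize:
        return 0
    return 1 if os.path.getsize(filePath) == int(expectedSize) else 0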
Test code
if __name__ == '__main__':
    # url = 'http://wppkg.baidupcs.com/issue/netdisk/MACguanjia/BaiduNetdisk_mac_3.4.1.dmg'
    url1 = 'https://dldir1.qq.com/wework/work_weixin/WeCom_3.1.1.3006.exe'
    a = downLoader(url1)
    a.run()
Postscript
This project ended up a bit anticlimactic, but I didn't want it hanging over me forever, so I wanted to close it out as quickly as I could. Along the way I hit plenty of snags of my own. For example, with FTP downloads, I suddenly couldn't find any resource to test FTP downloads against.... so I had no way to verify that part.
Still, the whole process was a lot of fun.