1. Analyze the website
target_url:http://www.dilidili.name/watch3/32061/
Target data source:
[screenshot: 千与千寻.png]
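The screenshot shows the player pulling the movie down as numbered HLS .ts segments. Before writing the crawler, a quick hypothetical probe (assuming the 2Tb76g3341… prefix seen in the browser's network tab) can confirm the zero-padded numbering:

import requests

# Hypothetical probe: check a few indices to confirm the segments are
# numbered with three zero-padded digits, as observed in the network tab.
base = 'http://kbzy.zxziyuan-yun.com/20180404/XVXMBPEV/800kb/hls/2Tb76g3341{:03d}.ts'
for i in (0, 9, 10, 100):
    r = requests.head(base.format(i), timeout=10)
    print(i, r.status_code, r.headers.get('Content-Length'))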
2. Crawl the segments with a process pool
from multiprocessing import Pool
from functools import reduce
import requests
import os
# With Python and requests installed, we're ready to roll!
path = '千与千寻/'
start = 0
length = 1866
speed = 20
if not os.path.exists(path):
    os.mkdir(path)
def get_content(indices):
    for i in indices:
        try:
            # Segment numbers are zero-padded to three digits in the URL,
            # which replaces the original three if/elif branches.
            url = 'http://kbzy.zxziyuan-yun.com/20180404/XVXMBPEV/800kb/hls/2Tb76g3341{:03d}.ts'.format(i)
            r = requests.get(url, timeout=30)
            # The byte string below is "服务器错误" ("server error") in GBK;
            # skip responses that are just that error page.
            if b'<h1>\xb7\xfe\xce\xf1\xc6\xf7\xb4\xed\xce\xf3</h1>' not in r.content:
                with open(path + '千与千寻{}.ts'.format(i), 'wb') as f:
                    f.write(r.content)
                print('segment saved', i)
        except Exception as e:
            print('segment failed', i, e)
def open_file(x):
    with open(x, 'rb') as f:
        return f.read()
def start_task():
    lst = list(range(start, length))
    # Split the index list into `speed` roughly equal chunks; the original
    # float-modulo slicing was fragile and could produce uneven chunks.
    chunk = -(-len(lst) // speed)  # ceiling division
    result = [lst[i:i + chunk] for i in range(0, len(lst), chunk)]
    pool = Pool(speed)
    for target in result:
        pool.apply_async(get_content, args=(target,))
    pool.close()
    pool.join()
    print('all downloads finished')
    return True
if __name__ == '__main__':
    if start_task():
        # Stitch the segments back together; skip indices whose download
        # was rejected with a server error and never written to disk.
        lst = [path + '千与千寻{}.ts'.format(x) for x in range(length)]
        lst = [p for p in lst if os.path.exists(p)]
        source = list(map(open_file, lst))
        z = reduce(lambda x, y: x + y, source)
        with open('千与千寻.ts', 'wb') as f:
            f.write(z)
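Note that reduce() concatenates every segment in memory before the final write, which for a feature-length movie is on the order of a gigabyte. A sketch of a streaming alternative that appends one segment at a time:

import os

# Streaming alternative to the reduce() merge: append each segment to the
# output as it is read, so peak memory stays at one segment, not the whole film.
def merge_streaming(out_name='千与千寻.ts'):
    with open(out_name, 'wb') as out:
        for x in range(length):
            part = path + '千与千寻{}.ts'.format(x)
            if not os.path.exists(part):  # segment skipped on server error
                continue
            with open(part, 'rb') as f:
                out.write(f.read())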
An optimization would be to keep a cache pool of already-fetched addresses to avoid re-crawling, but I'm lazy...
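A minimal version of that cache pool, assuming a segment that already exists on disk with nonzero size counts as done; calling this at the top of the get_content loop makes re-runs fetch only the missing pieces:

# Minimal cache check (the optimization mentioned above): a segment that is
# already on disk and non-empty is treated as downloaded and skipped.
def already_downloaded(i):
    target = path + '千与千寻{}.ts'.format(i)
    return os.path.exists(target) and os.path.getsize(target) > 0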
Result: http://59.110.157.193:8000/media/video/%E5%8D%83%E4%B8%8E%E5%8D%83%E5%AF%BB.ts