en....
总的来说,就是先从这个网页的 HTML 里筛选出我想要的文件的下载链接,再逐个下载。
访问该网页需要带上登录后得到的 cookie,不带 cookie 的普通 GET 请求无法直接访问。
脚本如下:
#!/usr/bin/python3
import sys, io, re, os
from urllib import request
def get_download_url(sub_url, base_url="http://redmine.springgroup.cn"):
    """Build the absolute download URL from one line of Redmine HTML.

    Args:
        sub_url: A line of HTML containing an ``href="..."`` attribute whose
            target path starts with ``/redmine``. Non-string input is
            converted with ``str()``.
        base_url: Scheme and host that the extracted path is appended to.

    Returns:
        ``base_url`` followed by the ``/redmine...`` path taken from the href.

    Raises:
        ValueError: If the line contains no href pointing at a ``/redmine``
            path.
    """
    # The optional non-capturing group skips a scheme+host prefix, so an
    # absolute href ("http://host/redmine/...") yields the same path as a
    # relative one instead of mistaking the host name for the path.
    match = re.search(
        r'href="(?:https?://[^/"]+)?(/redmine[^"]*)"', str(sub_url)
    )
    if match is None:
        raise ValueError("no /redmine download link found in: %r" % (sub_url,))
    return base_url + match.group(1)
def get_file_name(sub_url):
    """Extract the link text (used as the file name) from a line of HTML.

    The name is the text between the first ``>`` that follows the first
    ``href="`` and the next ``<`` on the same line.

    Args:
        sub_url: A line of HTML containing an anchor such as
            ``<a href="...">name.zip</a>``. Non-string input is converted
            with ``str()``.

    Returns:
        The anchor text exactly as it appears in the HTML (not unescaped,
        not trimmed).

    Raises:
        ValueError: If no ``href="...">text<`` sequence is present.
    """
    # Capture the text directly instead of stripping a set of characters
    # afterwards, so names that begin or end with a comma are kept intact.
    match = re.search(r'href=".*?>(.*?)<', str(sub_url))
    if match is None:
        raise ValueError("no link text found in: %r" % (sub_url,))
    return match.group(1)
def pre_fun(save_dir, redmine_number):
    """Prepare the download directory and return the issue URL and cookie.

    Creates ``<save_dir>/<redmine_number>`` if needed and makes it the
    process's current working directory.

    Args:
        save_dir: Directory under which the per-issue folder is created.
        redmine_number: Redmine issue number as a string; used both as the
            folder name and as the last segment of the issue URL.

    Returns:
        A ``(redmine_url, cookie_str)`` tuple: the issue page URL and the
        value to send in the ``cookie`` request header.
    """
    # Cookie obtained from the browser after logging in, in the form
    # 'JSESSIONID=xxxxxxxxxxxxxxxxxxxxxx; iPlanetDirectoryPro=xxxxxxxxxxxxxxxxxx'
    # NOTE(review): this is a hard-coded session credential; it should not be
    # committed or published, and it stops working once the session expires.
    cookie_str = r'_redmine_session=ajhuOC9xbG9NaWlyUjJ4RTBzcDF4cjl1SVVzUlF4V1dURitCQ2x1U0FpQ1kva1ZrM1ppZ3FDTjVXbnNkdlNHSld3WCt4UjVIYlFBcFhMd29mTVdTc290ZGk5WGRERzl0RmR6V3VubFMxQkF1VGQvQlVGcHdEZWhkMTJFMzNGbVdQSlhYcnJldG8'

    # exist_ok=True avoids the check-then-create race and also creates
    # save_dir itself when it does not exist yet.
    target_dir = os.path.join(save_dir, redmine_number)
    os.makedirs(target_dir, exist_ok=True)
    os.chdir(target_dir)

    # Page that can only be fetched when logged in
    redmine_url = 'http://redmine.springgroup.cn/redmine/issues/' + redmine_number
    return redmine_url, cookie_str
def _ensure_utf8_stdout():
    """Switch sys.stdout to UTF-8 once, without re-wrapping it on later calls."""
    stream = sys.stdout
    current = (getattr(stream, "encoding", None) or "").lower().replace("-", "")
    if current == "utf8":
        return
    if hasattr(stream, "reconfigure"):
        stream.reconfigure(encoding="utf8")
    elif hasattr(stream, "buffer"):
        sys.stdout = io.TextIOWrapper(stream.buffer, encoding="utf8")


def _build_request(url, cookie_str):
    """Return a Request for *url* carrying the session cookie and a browser User-Agent."""
    req = request.Request(url)
    # Set the cookie
    req.add_header('cookie', cookie_str)
    # Set the request header
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36')
    return req


def _is_archive_link(line):
    """Return True when an HTML line looks like a download link to an archive."""
    excluded_words = ("数据库", "已添加", "手册", ".htm")
    archive_markers = ("rar", "zip", "tgz", "tar")
    return (
        not any(word in line for word in excluded_words)
        and any(marker in line for marker in archive_markers)
        and "href" in line
        and "download" in line
    )


def _save_response(resp, path, chunk_size=64 * 1024):
    """Copy the response body to *path* in chunks instead of loading it whole."""
    with open(path, "wb") as out:
        while True:
            chunk = resp.read(chunk_size)
            if not chunk:
                break
            out.write(chunk)


def start_download(redmine_url, cookie_str):
    """Download every archive attachment linked from a Redmine issue page.

    The page is read line by line; each line that looks like an archive
    download link is resolved with ``get_file_name`` / ``get_download_url``
    and saved into the current working directory. Scanning stops at the
    first line containing ``</html>`` or at the end of the response.

    Args:
        redmine_url: URL of the issue page.
        cookie_str: Value for the ``cookie`` header, sent with every request.
    """
    # Change the encoding of standard output so the Chinese messages print
    _ensure_utf8_stdout()

    with request.urlopen(_build_request(redmine_url, cookie_str)) as page:
        # Iterating the response ends at EOF, so a page without "</html>"
        # can no longer spin forever on empty reads.
        for raw_line in page:
            # NOTE(review): errors='replace' keeps one bad byte from
            # aborting the scan; the original decoded strictly.
            line = raw_line.decode('utf-8', errors='replace')
            if "</html>" in line:
                break
            # Match the file name and the download URL
            if not _is_archive_link(line):
                continue
            # File name; basename() keeps a server-supplied name from
            # escaping the current directory.
            file_name = os.path.basename(get_file_name(line))
            # Download link
            download_url = get_download_url(line)
            print("开始下载 %s ..." % (file_name))
            # NOTE(review): the body is streamed, so a failure mid-transfer
            # now leaves a partial file; the original left none.
            with request.urlopen(_build_request(download_url, cookie_str)) as remote:
                _save_response(remote, file_name)
            print("ok\n")
if __name__ == '__main__':
    # Redmine issue number whose attachments should be fetched
    issue_id = '346683'
    # Directory under which the per-issue download folder is created
    target_root = "/home/xxwdll/soft/nginx/download/tmp"
    # Prepare the working directory, then start downloading
    start_download(*pre_fun(target_root, issue_id))
结果如下
网友评论