目的:
使用官网提供的方法同步下载速度太慢,于是直接把所有下载链接爬取下来,保存到文本文件中,再用迅雷批量下载。
import requests
from pyquery import PyQuery as pq
import time
import re
# Browser-like request headers sent with the page request.
# No 'Host' entry on purpose: requests derives it from the request URL, and a
# hard-coded Host naming a different site than the URL can make the server
# reject or misroute the request.
headers = {
    'Accept': 'application/json, text/plain, */*',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7',
    'Connection': 'keep-alive',
    # Referer kept on the same site as the page being requested.
    'Referer': 'http://hgdownload.cse.ucsc.edu/admin/exe/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
}
# Download the page and collect the <a> elements whose hrefs will be turned
# into download URLs.
url = 'http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/'
# requests has no default timeout; without one a stalled connection would
# block the script indefinitely.
res = requests.get(url=url, headers=headers, timeout=30)
print(res.status_code)
# Abort on a 4xx/5xx response instead of parsing an error page as if it were
# the expected listing.
res.raise_for_status()
jpy = pq(res.text)
# Anchors that are direct children of the <pre> that is the second child of
# <body>.
items = jpy('body > pre:nth-child(2) > a').items()
# Accumulates one newline-terminated URL per link.
urls = []
# Build an absolute URL for every usable link, echo it, and collect it.
for item in items:
    href = item.attr('href')
    # attr() yields None for an anchor without an href, which would make the
    # concatenation below raise TypeError. Hrefs starting with '?', '/' or
    # '../' are not names relative to this directory (presumably the index
    # page's sort controls and parent-directory link), so appending them to
    # the base URL would only produce bogus entries.
    if not href or href.startswith(('?', '/', '../')):
        continue
    url = 'http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/' + href + '\n'
    print(url)
    urls.append(url)

# Written once, after the loop; each entry already ends with '\n'.
with open('./tools_urls.txt', 'w', encoding='utf-8') as f:
    f.writelines(urls)
网友评论