Code repository:
https://github.com/floodsung/Deep-Learning-Papers-Reading-Roadmap
The contents of download.py are as follows:
from __future__ import print_function
import os
import re
from six.moves.urllib.error import HTTPError
import shutil
import argparse
import mistune
import bs4 as BeautifulSoup
import socket
import time
import requests
import sys

# Python 2 compatibility: force the default string encoding to UTF-8.
# Both calls are no-ops on Python 3, where these names no longer exist.
try:
    reload(sys)
except NameError:
    pass
try:
    sys.setdefaultencoding('utf8')
except AttributeError:
    pass
def download_pdf(link, location, name):
    # Fetch the file at `link` and write it to location/name.
    try:
        response = requests.get(link)
        with open(os.path.join(location, name), 'wb') as f:
            f.write(response.content)
    except HTTPError:
        print('>>> Error 404: cannot be downloaded!\n')
        raise
    except socket.timeout:
        print(" ".join(("can't download", link, "due to connection timeout!")))
        raise
def clean_pdf_link(link):
    # Turn an arXiv abstract URL into a direct PDF URL.
    if 'arxiv' in link:
        link = link.replace('abs', 'pdf')
        if not link.endswith('.pdf'):
            link = '.'.join((link, 'pdf'))
    print(link)
    return link
def clean_text(text, replacements={':': '_', ' ': '_', '/': '_', '.': '', '"': ''}):
    # Sanitize a heading or title so it is safe to use as a path component.
    for key, rep in replacements.items():
        text = text.replace(key, rep)
    return text
def print_title(title, pattern="-"):
    print('\n'.join(("", title, pattern * len(title))))
def get_extension(link):
    # Anything that is not an explicit pdf/html link defaults to 'pdf'.
    extension = os.path.splitext(link)[1][1:]
    if extension in ['pdf', 'html']:
        return extension
    return 'pdf'
def shorten_title(title):
    # Keep the leading "[NN]" index and any quoted title, then truncate.
    m1 = re.search(r'\[[0-9]*\]', title)
    m2 = re.search(r'".*"', title)
    if m1:
        title = m1.group(0)
    if m2:
        title = ' '.join((title, m2.group(0)))
    return title[:50] + ' [...]'
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Download all the PDF/HTML links in README.md')
    parser.add_argument('-d', action="store", dest="directory")
    parser.add_argument('--no-html', action="store_true", dest="nohtml", default=False)
    parser.add_argument('--overwrite', action="store_true", default=False)
    results = parser.parse_args()

    output_directory = 'pdfs' if results.directory is None else results.directory
    forbidden_extensions = ['html', 'htm'] if results.nohtml else []

    if results.overwrite and os.path.exists(output_directory):
        shutil.rmtree(output_directory)

    # Render README.md to HTML so its headings and links can be walked as a tree.
    with open('README.md', encoding='utf8') as readme:
        readme_html = mistune.markdown(readme.read())
        readme_soup = BeautifulSoup.BeautifulSoup(readme_html, "html.parser")

    # Start at the second <h1> and walk forward through its siblings:
    # h1/h2 headings open directories, <p> tags carry the paper links.
    point = readme_soup.find_all('h1')[1]
    failures = []
    while point is not None:
        if point.name:
            if re.search('h[1-2]', point.name):
                if point.name == 'h1':
                    h1_directory = os.path.join(output_directory, clean_text(point.text))
                    current_directory = h1_directory
                elif point.name == 'h2':
                    current_directory = os.path.join(h1_directory, clean_text(point.text))
                if not os.path.exists(current_directory):
                    os.makedirs(current_directory)
                print_title(point.text)
            if point.name == 'p':
                link = point.find('a')
                if link is not None:
                    link = clean_pdf_link(link.attrs['href'])
                    ext = get_extension(link)
                    print(ext)
                    if ext not in forbidden_extensions:
                        print(shorten_title(point.text) + ' (' + link + ')')
                        try:
                            name = clean_text(point.text.split('[' + ext + ']')[0])
                            fullname = '.'.join((name, ext))
                            if not os.path.exists('/'.join((current_directory, fullname))):
                                download_pdf(link, current_directory, fullname)
                        except KeyboardInterrupt:
                            try:
                                print("Press Ctrl-C in 1 second to quit")
                                time.sleep(1)
                            except KeyboardInterrupt:
                                print("Cancelling..")
                                break
                        except Exception:
                            failures.append(point.text)
        point = point.next_sibling

    print('Done!')
    if failures:
        print('Some downloads have failed:')
        for fail in failures:
            print('> ' + fail)
Reading the Code

- The overall flow for downloading the files listed in the markdown (see the sketch after this list):
  - Convert the markdown file to HTML, using mistune.markdown.
  - Parse the HTML with BeautifulSoup to obtain the file URLs.
  - Traverse sibling nodes with point.next_sibling and download each file.
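A minimal sketch of that flow, assuming a mistune version where mistune.markdown() returns an HTML string (which the script above relies on); the markdown snippet itself is made up for illustration:

import mistune
import bs4 as BeautifulSoup

md = '# Papers\n\n**[1]** Some paper. [[pdf]](https://arxiv.org/abs/1234.5678)\n'
html = mistune.markdown(md)                # markdown -> HTML string
soup = BeautifulSoup.BeautifulSoup(html, "html.parser")

# Every <a href> in the rendered HTML is a candidate download link.
for a in soup.find_all('a'):
    print(a.attrs['href'])                 # https://arxiv.org/abs/1234.5678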
- The main operations on the parsed BeautifulSoup tree are the following (a runnable demonstration follows the snippet):
readme_soup = BeautifulSoup.BeautifulSoup(readme_html, "html.parser")
point = readme_soup.find_all('h1')[1]   # second <h1> in the document
print(point)                            # the whole tag
print(point.name)                       # tag name, e.g. 'h1'
print(point.text)                       # text content of the tag
print(point.find('a'))                  # first <a> descendant, or None
point.find('a').attrs['href']           # its href attribute
point = point.next_sibling              # move to the next sibling node
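The same operations as a self-contained example, run against a tiny hand-written HTML fragment (the fragment is made up for illustration):

import bs4 as BeautifulSoup

readme_html = ("<h1>Intro</h1>"
               "<h1>Papers</h1>"
               "<p><strong>[1]</strong> A paper. "
               "<a href='https://arxiv.org/abs/1234.5678'>[pdf]</a></p>")
readme_soup = BeautifulSoup.BeautifulSoup(readme_html, "html.parser")

point = readme_soup.find_all('h1')[1]      # <h1>Papers</h1>
print(point.name, point.text)              # h1 Papers

point = point.next_sibling                 # the <p> right after it
print(point.name)                          # p
print(point.find('a').attrs['href'])       # https://arxiv.org/abs/1234.5678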
Practice

Using the traversal method learned above, implement a downloader for the ICCV 2021 papers:

- Traverse the tree to find the relevant nodes.
- Extract the links that end in .pdf.
- Download them with wget.
- Save everything into the current directory.
from __future__ import print_function
import os
import re
from six.moves.urllib.error import HTTPError
import shutil
import argparse
import mistune
import bs4 as BeautifulSoup
import socket
import time
import requests
import sys

# Python 2 compatibility: force the default string encoding to UTF-8.
# Both calls are no-ops on Python 3, where these names no longer exist.
try:
    reload(sys)
except NameError:
    pass
try:
    sys.setdefaultencoding('utf8')
except AttributeError:
    pass
def get_content(url):
    # Fetch `url` and return the response body, on both Python 2 and 3.
    info = sys.version_info
    if info[0] == 2:  # Python 2
        import urllib2
        header = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:48.0) Gecko/20100101 Firefox/48.0"}
        request = urllib2.Request(url=url, headers=header)
        response = urllib2.urlopen(request)
        content = response.read()
        return content
    elif info[0] == 3:  # Python 3
        req = requests.get(url=url)
        return req.text
    else:  # Python version could not be determined
        raise RuntimeError("python info not found.")
def main():
    url = "https://openaccess.thecvf.com/ICCV2021?day=all"
    readme_html = get_content(url)
    soup = BeautifulSoup.BeautifulSoup(readme_html, "html.parser")

    # Start at the first <dt> (one per paper) and walk its siblings.
    point = soup.find_all('dt')[0]
    while point:
        try:
            for a in point.find_all('a'):
                href = a.attrs.get('href', '')
                if href.endswith(".pdf"):
                    url = "https://openaccess.thecvf.com" + href
                    cmd = "wget -c " + url
                    os.system(cmd)
        except AttributeError:
            # NavigableString siblings have no find_all(); skip them.
            pass
        point = point.next_sibling


if __name__ == "__main__":
    main()
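Note that wget -c resumes interrupted downloads, so re-running the script picks up where it left off. If wget is not on the PATH, the os.system(cmd) line could be replaced with a requests-based download; the fetch_pdf helper below is a name introduced here for illustration, and it only skips already-completed files rather than truly resuming partial ones the way wget -c does:

import os
import requests

def fetch_pdf(url, directory="."):
    # Stream `url` into `directory`, skipping files that already exist.
    name = url.rsplit('/', 1)[-1]
    path = os.path.join(directory, name)
    if os.path.exists(path):
        return
    r = requests.get(url, stream=True)
    r.raise_for_status()
    with open(path, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1 << 16):
            f.write(chunk)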