[Code Reading] Deep-Learning-Papers-Reading-Roadmap

Author: Joyner2018 | Published 2021-10-25 00:49

Repository:

https://github.com/floodsung/Deep-Learning-Papers-Reading-Roadmap

The contents of download.py are as follows:

from __future__ import print_function
import os
import re
import shutil
import argparse
import mistune
import bs4 as BeautifulSoup
import socket
import time
import requests

# encoding=utf8  
import sys  
try:
    reload(sys)
except NameError:
    pass
try:
    sys.setdefaultencoding('utf8')
except AttributeError:
    pass

def download_pdf(link, location, name):
    try:
        response = requests.get(link)
        response.raise_for_status()  # surface 4xx/5xx instead of saving an error page
        with open(os.path.join(location, name), 'wb') as f:
            f.write(response.content)
    except requests.exceptions.HTTPError:
        print('>>> Error 404: cannot be downloaded!\n')
        raise
    except (socket.timeout, requests.exceptions.Timeout):
        print(" ".join(("can't download", link, "due to connection timeout!")))
        raise

def clean_pdf_link(link):
    if 'arxiv' in link:
        link = link.replace('abs', 'pdf')   
        if not(link.endswith('.pdf')):
            link = '.'.join((link, 'pdf'))

    print(link)
    return link

def clean_text(text, replacements = {':': '_', ' ': '_', '/': '_', '.': '', '"': ''}):
    for key, rep in replacements.items():
        text = text.replace(key, rep)
    return text    

def print_title(title, pattern = "-"):
    print('\n'.join(("", title, pattern * len(title)))) 

def get_extension(link):
    extension = os.path.splitext(link)[1][1:]
    if extension in ['pdf', 'html']:
        return extension
    return 'pdf'  # anything else defaults to pdf

def shorten_title(title):
    m1 = re.search(r'\[[0-9]*\]', title)
    m2 = re.search('".*"', title)
    if m1:
        title = m1.group(0)
    if m2:
        title = ' '.join((title, m2.group(0)))   
    return title[:50] + ' [...]'    

if __name__ == '__main__':

    parser = argparse.ArgumentParser(description = 'Download all the PDF/HTML links into README.md')
    parser.add_argument('-d', action="store", dest="directory")
    parser.add_argument('--no-html', action="store_true", dest="nohtml", default = False)
    parser.add_argument('--overwrite', action="store_true", default = False)    
    results = parser.parse_args()

    output_directory = 'pdfs' if results.directory is None else results.directory

    forbidden_extensions = ['html', 'htm'] if results.nohtml else []

    if results.overwrite and os.path.exists(output_directory):
        shutil.rmtree(output_directory)

    with open('README.md', encoding='utf8') as readme:
        readme_html = mistune.markdown(readme.read())
        readme_soup = BeautifulSoup.BeautifulSoup(readme_html, "html.parser")

    point = readme_soup.find_all('h1')[1]  # start at the second h1: the first content section

    failures = []
    while point is not None:
        if point.name:
            if re.search('h[1-2]', point.name):
                if point.name == 'h1':
                    h1_directory = os.path.join(output_directory, clean_text(point.text))
                    current_directory = h1_directory
                elif point.name == 'h2':
                    current_directory = os.path.join(h1_directory, clean_text(point.text))  
                if not os.path.exists(current_directory):
                    os.makedirs(current_directory)
                print_title(point.text)

            if point.name == 'p':
                link = point.find('a')
                if link is not None:
                    link = clean_pdf_link(link.attrs['href'])
                    ext = get_extension(link)
                    print(ext)
                    if ext not in forbidden_extensions:
                        print(shorten_title(point.text) + ' (' + link + ')')
                        try:
                            name = clean_text(point.text.split('[' + ext + ']')[0])
                            fullname = '.'.join((name, ext))
                            if not os.path.exists(os.path.join(current_directory, fullname)):
                                download_pdf(link, current_directory, fullname)
                        except KeyboardInterrupt:
                            try:
                                print("Press Ctrl-C in 1 second to quit")
                                time.sleep(1)
                            except KeyboardInterrupt:
                                print("Cancelling..")
                                break
                        except Exception:  # record any other failure and move on
                            failures.append(point.text)

        point = point.next_sibling          

    print('Done!')
    if failures:
        print('Some downloads have failed:')
        for fail in failures:
            print('> ' + fail)
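
The script is meant to be run from the repository root, next to README.md; given the argparse setup above, something like "python download.py -d pdfs --no-html" downloads everything into pdfs/ while skipping HTML links. To get a feel for the helper functions, here is a small sketch that calls them directly (run in the same session as the definitions; the inputs are made up, and the expected results in the comments are derived by reading the code, not captured output):

# Quick sanity checks of the helpers defined above.
clean_pdf_link('https://arxiv.org/abs/1409.1556')
# prints the cleaned link: 'abs' -> 'pdf', and '.pdf' is appended:
# https://arxiv.org/pdf/1409.1556.pdf

print(clean_text('1.1 "VGG": very deep nets'))
# ':', ' ' and '/' become '_'; '.' and '"' are dropped:
# 11_VGG__very_deep_nets

print(get_extension('https://example.com/paper.pdf'))    # pdf
print(get_extension('https://example.com/page.html'))    # html
print(get_extension('https://arxiv.org/pdf/1409.1556'))  # pdf (the default)

print(shorten_title('[1] "Deep learning." Nature 521 (2015)'))
# keeps the [n] index plus the quoted title, capped at 50 characters:
# [1] "Deep learning." [...]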

Code Reading

  1. The markdown download workflow
  • mistune.markdown converts the markdown file to HTML.
  • BeautifulSoup parses the HTML to locate the file links.
  • point.next_sibling walks the sibling nodes so each linked file can be downloaded (see the sketch after the snippet below).
  2. Operations available on the parsed BeautifulSoup tree include:
readme_soup = BeautifulSoup.BeautifulSoup(readme_html, "html.parser")
point = readme_soup.find_all('h1')[1]
print(point)
print(point.name)
print(point.text)
print(point.find('a'))
point.find('a').attrs['href']
point = point.next_sibling
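
Putting those pieces together, here is a minimal, self-contained sketch of the same traversal pattern on a made-up markdown string (the headings and URLs are hypothetical; it assumes a mistune version where mistune.markdown(text) returns an HTML string, as download.py does):

import mistune
import bs4 as BeautifulSoup

md = """
# Deep Learning Basics

[The VGG paper](https://arxiv.org/abs/1409.1556)

# Reinforcement Learning

[Another paper](https://example.com/paper.pdf)
"""

html = mistune.markdown(md)
soup = BeautifulSoup.BeautifulSoup(html, "html.parser")

point = soup.find_all('h1')[0]   # start at the first heading
while point is not None:
    if point.name == 'h1':       # text nodes have point.name == None
        print('directory:', point.text)
    elif point.name == 'p':
        a = point.find('a')
        if a is not None:
            print('  link:', a.attrs['href'])
    point = point.next_sibling   # siblings include '\n' text nodes

# Expected output:
# directory: Deep Learning Basics
#   link: https://arxiv.org/abs/1409.1556
# directory: Reinforcement Learning
#   link: https://example.com/paper.pdf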

Practice

Using the traversal method learned above, let's download the ICCV 2021 papers:

  • Traverse the tree to find the relevant nodes
  • Pick out the links ending in .pdf
  • Download each one with wget
  • Save the files into the current directory
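Before the script, a note on the page structure it relies on: the open-access listing interleaves dt elements (paper titles) with dd elements (authors and download links). A simplified, hypothetical sketch of that markup, to show why the script starts at the first dt and then follows next_sibling:

# Simplified, hypothetical sketch of the listing markup the script walks:
#
#   <dt><a href="/content/ICCV2021/html/...">Paper title</a></dt>
#   <dd>Author One, Author Two</dd>
#   <dd><a href="/content/ICCV2021/papers/..._paper.pdf">pdf</a> ...</dd>
#
# find_all('dt')[0] lands on the first title; next_sibling then steps through
# the following <dd> nodes (and the text nodes between them), which is where
# the .pdf links live.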
from __future__ import print_function
import os
import bs4 as BeautifulSoup

# encoding=utf8
import sys
try:
    reload(sys)
except NameError:
    pass
try:
    sys.setdefaultencoding('utf8')
except AttributeError:
    pass

def get_content(url):
    import sys
    info = sys.version_info
    if info[0] == 2: #python2
        import urllib2
        header = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:48.0) Gecko/20100101 Firefox/48.0"}
        request = urllib2.Request(url=url,headers=header)
        response = urllib2.urlopen(request)
        content = response.read()
        return content
    elif info[0] == 3: # python3
        import requests
        req = requests.get(url=url)
        # print(req.text)
        return req.text
    else: # unknown Python version
        raise RuntimeError("python info not found.")

def main():
    url = "https://openaccess.thecvf.com/ICCV2021?day=all"
    readme_html = get_content(url)
    soup = BeautifulSoup.BeautifulSoup(readme_html, "html.parser")
    point = soup.find_all('dt')[0]  # first paper-title entry on the page
    while point:
        try:
            for a in point.find_all('a'):
                href = a.attrs['href']
                if href.endswith(".pdf"):
                    url="https://openaccess.thecvf.com"+href
                    cmd = "wget -c "+url
                    os.system(cmd)
        except (AttributeError, KeyError):
            # text nodes between tags have no find_all; some <a> tags may lack href
            pass
        point=point.next_sibling

if __name__=="__main__":
    main()
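
Shelling out to wget works wherever wget is installed, but the same download can be done portably with requests. A minimal sketch of a drop-in replacement for the os.system call above (the helper name download_file is mine; error handling kept deliberately simple):

import os
import requests

def download_file(url, directory="."):
    # Rough analogue of "wget -c": skip files that already exist
    # (no actual byte-range resume).
    name = url.split('/')[-1]
    path = os.path.join(directory, name)
    if os.path.exists(path):
        return path
    resp = requests.get(url, timeout=60)
    resp.raise_for_status()
    with open(path, 'wb') as f:
        f.write(resp.content)
    return path

# In main() above, the two wget lines could then become:
#     download_file("https://openaccess.thecvf.com" + href)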
