下载 CVPR 2020 的论文

运行环境

ubuntu 16.04

安装的包

pip install bs4 --user
pip install requests --user
pip install youtube-dl --user

爬虫下载的代码

#encoding=utf-8
import sys
import os
from bs4 import BeautifulSoup

def get_content(url):
    """Fetch ``url`` over HTTP and return the response body.

    On Python 2 the page is fetched with ``urllib2`` and the result of
    ``response.read()`` is returned; on Python 3 it is fetched with
    ``requests`` and the decoded ``text`` of the response is returned.

    Args:
        url: Address of the page to download.

    Returns:
        The content of the page.

    Raises:
        RuntimeError: If the interpreter's major version is neither 2 nor 3.
    """
    # Send the same browser-like User-Agent on both code paths so the server
    # sees identical requests regardless of the interpreter version.
    header = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:48.0) Gecko/20100101 Firefox/48.0"}
    major = sys.version_info[0]

    if major == 2:  # python2
        import urllib2
        request = urllib2.Request(url=url, headers=header)
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()

    if major == 3:  # python3
        import requests
        return requests.get(url=url, headers=header).text

    # Unknown interpreter version. Raising a bare string is itself a
    # TypeError, so raise a real exception that carries the message.
    raise RuntimeError("python info not found.")

def get_a_flags(html):
    """Parse ``html`` with the "html.parser" backend and return all ``<a>`` tags."""
    return BeautifulSoup(html, "html.parser").find_all("a")

def get_a_hrefs(a_falgs):
    """Return the ``href`` value of every tag that has a non-empty one.

    Tags without an ``href`` entry, or whose ``href`` is falsy (for example
    an empty string), are skipped. Input order is preserved.
    """
    return [tag["href"] for tag in a_falgs if tag.get("href")]

def download(a_hrefs):
    """Download each URL in ``a_hrefs`` with the ``youtube-dl`` command.

    Args:
        a_hrefs: A list of URLs, or a single URL (any non-list value is
            treated as one item).

    Only Windows and Linux are handled; on any other platform a notice is
    printed and nothing is downloaded.
    """
    import platform
    import subprocess

    # Accept a single URL as well as a list of URLs.
    if not isinstance(a_hrefs, list):
        a_hrefs = [a_hrefs]

    if platform.system() not in ("Windows", "Linux"):
        print("Other System tasks")
        return

    for href in a_hrefs:
        try:
            # Pass the URL as its own argv entry (no shell) so that characters
            # such as ';', '&' or spaces in a scraped link cannot be
            # interpreted as shell syntax. "--" stops option parsing, so a
            # link starting with '-' is not mistaken for a flag.
            subprocess.call(["youtube-dl", "--", href])
        except OSError as err:
            # The executable could not be started (e.g. not installed);
            # every remaining download would fail the same way.
            print("could not run youtube-dl: %s" % err)
            return

def main(url):
    """Download every PDF linked from the page at ``url``.

    The page is fetched, all ``<a href=...>`` values are collected, those
    containing ".pdf" are resolved against ``url`` and handed to
    ``download``. Prints "end!" when finished.

    Args:
        url: Address of the listing page to crawl.
    """
    try:
        from urllib.parse import urljoin  # Python 3
    except ImportError:
        from urlparse import urljoin  # Python 2

    content = get_content(url)
    a_falgs = get_a_flags(content)
    a_hrefs = get_a_hrefs(a_falgs)
    # Resolve links relative to the page they came from instead of gluing a
    # fixed host prefix in front: this keeps relative links working and also
    # handles root-relative ("/x.pdf") and already-absolute hrefs correctly.
    pdf_urls = [urljoin(url, href) for href in a_hrefs if ".pdf" in href]
    download(pdf_urls)
    print("end!")

# Script entry point: crawl the CVPR 2020 open-access listing page and
# download the PDFs it links to.
if __name__=="__main__":
    main("http://openaccess.thecvf.com/CVPR2020.py")