Notes on downloading files [just a record; corrections welcome]


Author: 折花载酒z | Published 2019-03-15 09:55

    Requirement: download files from a website and record each file's name and local save path. (Deduplication is keyed on the id in the download URL.)
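    A minimal sketch of that dedup idea, assuming the download URL carries an id query parameter (the parameter name and the URL shape here are hypothetical):

    from urllib.parse import urlparse, parse_qs

    def dedup_key(url):
        # e.g. http://example.com/download?id=12345 -> '12345'
        qs = parse_qs(urlparse(url).query)
        return qs.get('id', [url])[0]  # fall back to the full URL if there is no id

    urls = ['http://example.com/download?id=1', 'http://example.com/download?id=1&page=2']
    seen = set()
    fresh = []
    for u in urls:
        key = dedup_key(u)
        if key not in seen:
            seen.add(key)
            fresh.append(u)  # only the first URL with id=1 survives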

    scrapy's built-in file download: when downloading files this way, I could not get the local save path stored into the MySQL database.
    wget: the saved file name is just the UrlEncode-encoded string, so names are very likely to collide;
    something like this: %e8%bf%99%e6%98%af%e4%b8%80%e4%b8%aa%e6%96%87%e4%bb%b6%e5%90%8d.pdf
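    For reference, that string is just a percent-encoded file name, which urllib.parse decodes directly:

    from urllib.parse import unquote

    encoded = '%e8%bf%99%e6%98%af%e4%b8%80%e4%b8%aa%e6%96%87%e4%bb%b6%e5%90%8d.pdf'
    print(unquote(encoded))  # -> 这是一个文件名.pdf ("this is a file name.pdf")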
    urlretrieve: imported via from urllib.request import urlretrieve. Probably because this library is fairly old, it tends to hang when downloading large files; even after borrowing other people's code that uses socket timeouts to avoid this, the results were still poor (the typical workaround is sketched below).
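    The socket workaround referred to above is usually a global default timeout, so a stalled transfer raises socket.timeout instead of hanging forever. A sketch (the 30-second value, URL, and path are placeholders):

    import socket
    from urllib.request import urlretrieve

    socket.setdefaulttimeout(30)  # applies to every new socket, including urlretrieve's

    try:
        urlretrieve('http://example.com/big.pdf', '/tmp/big.pdf')
    except socket.timeout:
        print('download timed out; retry or skip')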
    The full code:
        # requires: import socket, requests
        # from urllib.request import urlretrieve
        # from urllib.parse import unquote
        # download_files maps each download URL to a file name;
        # FILE_STORE is the download directory (assumed to end with a path separator)
        if download_files == {}:
            return
        file_names = []
        try:
            for d in download_files.keys():
                if 'ftp' in d:
                    continue
                temp = requests.get(d, headers=cls.headers)
                if 'Content-Disposition' in temp.headers.keys():
                    suffix = unquote(
                        temp.headers['Content-Disposition'].split('=')[-1]).split('.')[-1]
                    file_title = FILE_STORE + d.split('=')[-1] + '.' + suffix
                    urlretrieve(d, file_title)
                    file_names.append(file_title)
                elif len(d.split('.')[-1]) < 5:
                    if d.split('=')[-1] == d:
                        file_title = FILE_STORE + d.split('/')[-1]
                        urlretrieve(d, file_title)
                        file_names.append(file_title)
                        continue
                    suffix = '.' + d.split('.')[-1]
                    file_title = FILE_STORE + d.split('=')[-1] + suffix
                    urlretrieve(d, file_title)
                    file_names.append(file_title)
                    cls.logger.debug(
                        'elif_download_files: {},file_title: {}'.format(
                            download_files, file_title),)

                else:
                    res = requests.get(d, headers=cls.headers)
                    if '系统出现异常' in res.text:  # server returned its error page
                        cls.logger.debug('else_if_err: {}'.format(res.text))
                        continue
                    suffix = '.' + d.split('.')[-1]
                    file_title = FILE_STORE + d.split('=')[-1] + suffix
                    urlretrieve(d, file_title)
                    file_names.append(file_title)
                    cls.logger.debug(
                        'else_download_files: {},file_title: {}'.format(
                            download_files, file_title),)
            return file_names
        except socket.timeout:
            count = 1
            while count <= 3:
                try:
                    for d in download_files.keys():
                        if 'ftp' in d:
                            continue
                        temp = requests.get(d, headers=cls.headers)
                        if 'Content-Disposition' in temp.headers.keys():
                            # print(temp.headers['Content-Disposition'].split('=')[-1])
                            file_title = unquote(
                                temp.headers['Content-Disposition'].split('=')[-1])
                            file_title = FILE_STORE + file_title
                            print(file_title)
                            urlretrieve(d, file_title)
                            file_names.append(file_title)
                        elif len(d.split('.')[-1]) < 5:
                            if d.split('=')[-1] == d:
                                file_title = FILE_STORE + d.split('/')[-1]
                                urlretrieve(d, file_title)
                                file_names.append(file_title)
                                continue
                            suffix = '.' + d.split('.')[-1]
                            file_title = FILE_STORE + d.split('=')[-1] + suffix
                            urlretrieve(d, file_title)
                            file_names.append(file_title)
                            cls.logger.debug(
                                'elif_download_files: {},file_title: {}'.format(
                                    download_files, file_title), )
                        else:
                            res = requests.get(d, headers=cls.headers)
                            if '系统出现异常' in res.text:
                                cls.logger.debug('else_if_err: {}'.format(res.text))
                                continue
                            suffix = '.' + d.split('.')[-1]
                            file_title = FILE_STORE + d.split('=')[-1] + suffix
                            urlretrieve(d, file_title)
                            file_names.append(file_title)
                            cls.logger.debug(
                                'else_download_files: {},file_title: {}'.format(
                                    download_files, file_title), )
                    return file_names
    
                except socket.timeout:
                    err_info = 'Reloading for %d time' % count if count == 1 else 'Reloading for %d times' % count
                    print(err_info)
                    count += 1
            if count > 3:
                print("downloading failed!")
    

    requests: you have to pass the stream parameter when downloading files; here I read 1024 bytes at a time.
    Code:
        # additionally requires: from contextlib import closing
        file_names = []
        try:
            for d in download_files.keys():
                if 'ftp' in d:
                    continue
                temp = requests.get(d, headers=cls.headers)
                if 'Content-Disposition' in temp.headers.keys():
                    suffix = unquote(
                        temp.headers['Content-Disposition'].split('=')[-1]).split('.')[-1]
                    file_title = FILE_STORE + d.split('=')[-1] + '.' + suffix
                    with closing(requests.get(
                            url=d,
                            verify=False, stream=True)) as res:
                        with open(file_title, 'wb') as fd:
                            print('downloading a new file...')
                            for chunk in res.iter_content(chunk_size=1024):
                                if chunk:
                                    fd.write(chunk)
                        file_names.append(file_title)
                elif len(d.split('.')[-1]) < 5:
                    if d.split('=')[-1] == d:
                        file_title = FILE_STORE + d.split('/')[-1]
                        with closing(requests.get(
                                url=d,
                                verify=False, stream=True)) as res:
                            with open(file_title, 'wb') as fd:
                                print('downloading a new file...')
                                for chunk in res.iter_content(chunk_size=1024):
                                    if chunk:
                                        fd.write(chunk)
                            file_names.append(file_title)
                        continue
                    suffix = '.' + d.split('.')[-1]
                    file_title = FILE_STORE + d.split('=')[-1] + suffix
                    with closing(requests.get(
                            url=d,
                            verify=False, stream=True)) as res:
                        with open(file_title, 'wb') as fd:
                            print('downloading a new file...')
                            for chunk in res.iter_content(chunk_size=1024):
                                if chunk:
                                    fd.write(chunk)
                        file_names.append(file_title)
                    cls.logger.debug(
                        'elif_download_files: {},file_title: {}'.format(
                            download_files, file_title),)

                else:
                    res = requests.get(d, headers=cls.headers)
                    if '系统出现异常' in res.text:  # server returned its error page
                        cls.logger.debug('else_if_err: {}'.format(res.text))
                        continue
                    suffix = '.' + d.split('.')[-1]
                    file_title = FILE_STORE + d.split('=')[-1] + suffix
                    with closing(requests.get(
                            url=d,
                            verify=False, stream=True)) as res:
                        with open(file_title, 'wb') as fd:
                            print('downloading a new file...')
                            for chunk in res.iter_content(chunk_size=1024):
                                if chunk:
                                    fd.write(chunk)
                        file_names.append(file_title)
                    cls.logger.debug(
                        'else_download_files: {},file_title: {}'.format(
                            download_files, file_title),)
            return file_names
        except Exception as e:
            print(e)
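    The three branches above repeat the same streaming loop, so the core pattern can be factored into one helper, roughly like this (the name stream_download is mine, and the timeout and raise_for_status() calls are additions the original code does not make):

    import requests

    def stream_download(url, save_path, headers=None, chunk_size=1024):
        # stream the body to disk instead of buffering it all in memory
        with requests.get(url, headers=headers, stream=True, timeout=60) as res:
            res.raise_for_status()  # fail fast on 4xx/5xx instead of saving an error page
            with open(save_path, 'wb') as fd:
                for chunk in res.iter_content(chunk_size=chunk_size):
                    if chunk:  # skip keep-alive chunks
                        fd.write(chunk)
        return save_path

    Using the response as a context manager also guarantees the connection is released even if writing to disk fails.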
    
