美文网首页
爬取名称、md5值、下载链接,批量下载升级包,计算文件MD5值

爬取名称、md5值、下载链接,批量下载升级包,计算文件MD5值

作者: 闭眼就能看得见 | 来源:发表于2019-03-28 22:36 被阅读0次

    今日学习

    功能介绍
    • 计算指定文件的md5值
    • 爬取绿盟升级包名称和md5值,并生成excel
    • 计算指定文件夹里的所有文件的md5值并与官网上爬取的绿盟升级包名称和md5值做比较
    • 自动下载同一类型(增量更新)升级包到指定文件夹
    import requests
    from lxml import etree
    import os
    import hashlib
    import getopt
    import sys
    import xlwt
    # Target page URL; overwritten with the -u option's value when run as a script.
    url = ''
    # Local file/folder path; overwritten with the -p option's value when run as a script.
    path = ''
    # Scrape the upgrade-package names and MD5 values from the page into a dict.
    def spider_filename_md5(url):
        """Fetch *url* and return a dict mapping package name to MD5 text.

        The page body is decoded as UTF-8 and queried with two XPath
        expressions: one for the link elements (their text is the name) and
        one for the MD5 text nodes.  The two result lists are paired
        positionally.  The resulting dict is handed to ``excel`` before it
        is returned.
        """
        page_bytes = requests.get(url).content
        tree = etree.HTML(page_bytes.decode('utf-8'))
        link_nodes = tree.xpath('/html/body/section/div/section/div/div[2]/div[2]/table/tr/td/a')
        package_names = [str(node.text) for node in link_nodes]
        digest_nodes = tree.xpath('/html/body/section/div/section/div/div[2]/div[2]/table/tr[2]/td[1]/text()')
        digests = [str(text.strip()) for text in digest_nodes]
        packages = dict(zip(package_names, digests))
        # Export the scraped table as a side effect of every scrape.
        excel(packages)
        return packages
    
    def excel(dic):
        """Write the mapping *dic* to ``md5.xls`` in the working directory.

        The sheet ``sheet1`` gets a header row ("name", "md5") followed by
        one row per key of *dic*: the key in column 0, its value in column 1.
        """
        book = xlwt.Workbook(encoding='utf-8')
        sheet = book.add_sheet('sheet1')
        # Header row.
        for column, title in enumerate(("name", "md5")):
            sheet.write(0, column, label=title)
        # Data rows start at row 1, directly below the header.
        for row, package in enumerate(dic, start=1):
            sheet.write(row, 0, label=package)
            sheet.write(row, 1, label=dic[package])
        book.save('md5.xls')
    # Compute the MD5 digest of a file.
    def md5(path,Bytes=1024):
        """Return the hexadecimal MD5 digest of the file at *path*.

        The file is opened in binary mode and consumed in chunks of *Bytes*
        bytes so that it never has to be held in memory as a whole.
        """
        digest = hashlib.md5()
        with open(path, 'rb') as stream:
            # iter() keeps calling read() until it returns the empty sentinel.
            for chunk in iter(lambda: stream.read(Bytes), b''):
                digest.update(chunk)
        return digest.hexdigest()
    
    # Collect the name and MD5 value of every file in the given folder into a dict.
    def local_filename_md5(path):
        """Return a dict mapping each file name in *path* to its MD5 digest.

        Only regular files directly inside *path* are hashed (via ``md5``).
        Sub-directories are skipped because they cannot be opened for
        reading.  Paths are built with ``os.path.join`` so the function works
        with any platform's separator, not only the Windows backslash.
        """
        dic2 = {}
        for entry in os.listdir(path):
            full_path = os.path.join(path, entry)
            # os.listdir also returns directories; hashing one would raise.
            if not os.path.isfile(full_path):
                continue
            dic2[entry] = md5(full_path)
        return dic2
    # Compare the MD5 values of local files against the reference MD5 values.
    def duibi(dic1,dic2):
        """Report local files whose MD5 is wrong or which are unknown remotely.

        *dic1* is the reference mapping (name -> md5) and *dic2* the local
        one.  For every name in *dic2* a line is printed when the name is
        missing from *dic1* or when the two digests differ; matching files
        produce no output.  Nothing is returned.
        """
        for filename in dic2:
            if filename not in dic1:
                print('目标网址无此本地文件:'+filename)
                continue
            if dic2[filename] == dic1[filename]:
                # Digest matches the reference: nothing to report.
                continue
            print(filename+':md5值不正确')
    
    # Automatically download the patch packages (incremental updates) from the site.
    def down_file(url):
        """Download every package linked on the page at *url*.

        The page is parsed for download links (prefixed with
        ``http://update.nsfocus.com``), for the link texts (used as file
        names) and for the ``h2`` heading (used as the folder name).  Files
        are saved under ``<current dir>/<heading>/<name>``; a file that
        already exists locally is neither downloaded nor overwritten.

        Raises:
            IndexError: if links were found but the page has no ``h2`` heading.
            KeyError: if a download link has no corresponding name.
        """
        response = requests.get(url)
        html = str(response.content, encoding='utf-8')
        s = etree.HTML(html)
        down_path = s.xpath('/html/body/section/div/section/div/div[2]/div[2]/table/tr[1]/td[1]/a/@href')
        down_path = [str('http://update.nsfocus.com' + i) for i in down_path]
        name = s.xpath('/html/body/section/div/section/div/div[2]/div[2]/table/tr/td/a')
        name = [str(i.text) for i in name]
        # Links and names are paired positionally.
        dic3 = dict(zip(down_path, name))
        file_folder_name = s.xpath('/html/body/section/div/section/div/div[2]/div[2]/h2')
        file_folder_name = [str(i.text) for i in file_folder_name]
        headers = {
                'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0',
                'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
                'Accept-Encoding':'gzip, deflate',
                'Connection':'close',
                'Upgrade-Insecure-Requests':'1',
            }
        if not down_path:
            # Nothing to download, so do not create a folder either.
            return
        target_dir = os.path.join(os.path.abspath('.'), str(file_folder_name[0]))
        # Create the folder up front so the first package is saved on a fresh
        # run instead of being skipped by the folder-creation branch.
        os.makedirs(target_dir, exist_ok=True)
        for link in down_path:
            target_file = os.path.join(target_dir, str(dic3[link]))
            # Check before fetching: avoids re-downloading existing packages.
            if os.path.exists(target_file):
                continue
            download = requests.get(link, headers=headers)
            with open(target_file, 'wb') as f:
                f.write(download.content)
    
    def use():
        """Print the command-line help: a heading plus one line per mode."""
        help_lines = (
            "helpinfo:",
            "获取指定文件的md5值。格式:-m -p 文件路径包含文件名字;例子 python md5.py -m -p c:\\a\\p.bat",
            "爬取指定网页升级包名称和md5值,并生成md5.excel。格式:-n -u url ;例子 python md5.py -n -u http://*.*.*.* ",
            "获取指定文件夹内所有文件的md5值,并与官网比较。格式:-l -u url -s 文件夹目录 ",
            "下载指定网页的升级包(增量更新) 格式:-d -u url",
        )
        for line in help_lines:
            print(line)
    # Command-line options:
    #   -h        print the help text
    #   -m        mode 1: print the MD5 of the file given with -p
    #   -n        mode 2: scrape names/MD5 values from the page given with -u
    #   -l        mode 3: compare a local folder (-p or -s) against the page (-u)
    #   -d        mode 4: download the packages linked on the page given with -u
    #   -p PATH   file or folder path
    #   -s PATH   folder path (the spelling used in the help text for -l)
    #   -u URL    page URL
    # "-s" is declared with an argument ("s:") so that the documented
    # "-l -u url -s <folder>" invocation works instead of aborting.
    opts,args = getopt.getopt(sys.argv[1:], "hldmns:p:u:")
    if __name__ == '__main__':
        listen = None
        for a,b in opts:
            if a == "-h":
                use()
            elif a == "-m":
                listen = 1
            elif a == "-n":
                listen = 2
            elif a == "-l":
                listen = 3
            elif a in ("-p", "-s"):
                path = b
            elif a == "-u":
                url = b
            elif a == "-d":
                listen = 4
            else:
                # Raised explicitly: a bare assert would be stripped under -O.
                raise AssertionError("Unhandled Option")
        if listen == 1:
            print(md5(path))
        elif listen == 2:
            spider_filename_md5(url)
        elif listen == 3:
            duibi(spider_filename_md5(url),local_filename_md5(path))
        elif listen == 4:
            down_file(url)
        else:
            # No mode selected (e.g. only -h was given): nothing more to do.
            pass
    

    相关文章

      网友评论

          本文标题:爬取名称、md5值、下载链接,批量下载升级包,计算文件MD5值

          本文链接:https://www.haomeiwen.com/subject/dmyhbqtx.html