MD5 as a unique file identifier
If the duplicate files you need to deal with have different file names, the simplest way to decide whether two files are identical is to compare their MD5 hashes.
import hashlib  # needed by this snippet; also included in the full code below

def md5sum(filename, blocksize=65536):
    hash = hashlib.md5()
    with open(filename, "rb") as f:
        # read in fixed-size chunks so large files never sit in memory whole
        for block in iter(lambda: f.read(blocksize), b""):
            hash.update(block)
    return hash.hexdigest()
This function computes a file's MD5 digest quickly. blocksize controls how many bytes are read per chunk and can be tuned to your file sizes and CPU; a value roughly equal to the average file size is a common choice.
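For example, a direct check of two specific files might look like this (the file names here are placeholders, not from the original project):

    # assumes md5sum() defined above; "a.jpg" / "b.jpg" are hypothetical examples
    if md5sum("a.jpg") == md5sum("b.jpg"):
        print("same content, regardless of file name")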
Hashing every file and storing the MD5 values in a dict
Walk the directory, hash each file, and use a Python dict that maps each MD5 value to the list of files sharing it. This part is fairly simple:
def build_dup_dict(dir_path):
    def save(file):
        hash = md5sum(file)
        if hash not in dup:
            dup[hash] = [file]
        else:
            dup[hash].append(file)
    for idx, path in enumerate(glob.glob(dir_path + "*.jpg")):
        if idx % 100 == 0:
            print(idx)  # progress: one line every 100 files
        save(path)
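To see which files actually collide, you can scan the dict afterwards; a small sketch, assuming build_dup_dict has already populated the global dup dict:

    # every hash that maps to more than one file is a duplicate group
    for h, files in dup.items():
        if len(files) > 1:
            print(h, files)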
Saving the images
Not much to say here: for each hash, copy the first file in its list to save_path, which keeps exactly one copy of every distinct image.
import os
import shutil

def main():
    build_dup_dict(photo_path)
    for key in dup:
        path = dup[key][0]  # keep the first file recorded for each hash
        name = os.path.basename(path)
        shutil.copyfile(path, save_path + name)
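A small robustness tweak (my suggestion, not in the original): building the destination with os.path.join avoids relying on save_path ending in a slash.

    # equivalent copy, assuming path/name/save_path as above
    shutil.copyfile(path, os.path.join(save_path, name))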
Full code
import glob
import hashlib
import os
import shutil

dup = {}
photo_path = "F:/Glasses data/finalresult/"
save_path = "F:/Glasses data/final/"

def md5sum(filename, blocksize=65536):
    hash = hashlib.md5()
    with open(filename, "rb") as f:
        for block in iter(lambda: f.read(blocksize), b""):
            hash.update(block)
    return hash.hexdigest()

def build_dup_dict(dir_path):
    def save(file):
        hash = md5sum(file)
        if hash not in dup:
            dup[hash] = [file]
        else:
            dup[hash].append(file)
    for idx, path in enumerate(glob.glob(dir_path + "*.jpg")):
        if idx % 100 == 0:
            print(idx)
        save(path)

def main():
    build_dup_dict(photo_path)
    for key in dup:
        path = dup[key][0]
        name = os.path.basename(path)
        shutil.copyfile(path, save_path + name)

if __name__ == "__main__":
    main()
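On Python 3.11 or newer, hashlib.file_digest can replace the manual read loop in md5sum; a minimal sketch, not part of the original script:

    import hashlib

    def md5sum_file_digest(filename):
        # hashlib.file_digest (Python 3.11+) handles chunked reading internally
        with open(filename, "rb") as f:
            return hashlib.file_digest(f, "md5").hexdigest()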