lang/python/ FindDuplicateFiles
Usage
fdup root root2 ...
where root, root2, etc. are directories to search recursively from. The -Q flag enables the final whole-file hashing pass; with -q (the default behaviour) the program stops after hashing the first 1M of each remaining candidate.
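For example, to compare a photo directory against a backup of it with the whole-file pass enabled (the paths here are only illustrative):
fdup -Q ~/photos /mnt/backup/photos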
Strategy
First, we group the files by size, the rationale being that if two files' sizes differ, they cannot be equal. From this we compile a list of candidate duplicates for the next step.
Next, we group those candidates by the sha256 hash of their first 64k. This is far quicker than hashing an entire file, and is intended to weed out most non-duplicates, since files that differ usually differ somewhere within the first 64k.
Then, following similar reasoning, we hash the first 1 megabyte of the remaining candidates.
Finally, for the files we still cannot rule out as duplicates, we hash the entire file so as to be certain; this pass runs only when -Q is given. (We could instead have compared the candidates byte by byte at this stage, as in the sketch that follows.)
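As a minimal sketch of that alternative, the standard library's filecmp module can compare two files byte by byte (the paths here are hypothetical):
import filecmp

# shallow=False forces filecmp to compare the file contents byte by byte,
# rather than deciding equality from os.stat() metadata alone
if filecmp.cmp("/data/a.bin", "/backup/a.bin", shallow=False):
    print("identical contents")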
The program could also be extended to write out the inode of each file, so that two hard links pointing at the same underlying file can be told apart from two genuinely separate files with identical contents.
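A sketch of how that could be done on a POSIX system (the file_id helper below is illustrative, not part of the program that follows): two hard links to the same file share a device and inode number, so that pair identifies the underlying file rather than the path name.
import os

def file_id(path):
    # hard links to the same file share st_dev and st_ino, so this pair
    # identifies the file itself rather than its name
    st = os.stat(path)
    return (st.st_dev, st.st_ino)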
Source
#!/usr/bin/env python3
import sys, os, os.path
import hashlib
from collections import defaultdict

quick = True

def doprint(*xs,**kw):
    # print() followed by an immediate flush, so progress output is not buffered
    if "file" in kw:
        f = kw['file']
    else:
        f = sys.stdout
    print(*xs,**kw)
    f.flush()

def hash_first(filename,chunk_size=64*1024):
    # sha256 of just the first chunk_size bytes of the file
    doprint(f"# Hashing first {chunk_size} of {filename}")
    with open(filename,"rb") as f:
        bytes = f.read(chunk_size)
        hash = hashlib.sha256(bytes).hexdigest()
    return hash

def hash_all(filename,chunk_size=1024*1024):
    # sha256 of the whole file, read in chunk_size pieces,
    # printing a rough progress indicator every 30 chunks
    doprint(f"# Hashing all of {filename}")
    sz = os.path.getsize(filename)
    x = 0
    sha = hashlib.sha256()
    i = 0
    with open(filename,"rb") as f:
        while byte_block := f.read(chunk_size):
            doprint("#",end="")
            sha.update(byte_block)
            x += len(byte_block)
            i += 1
            if i >= 30:
                pc = 100*(x/sz)
                doprint(f"# {pc:0.3f}%")
                i = 0
    doprint()
    return sha.hexdigest()
#roots = sys.argv[1:]
roots = []
for x in sys.argv[1:]:
    if x == "-q":
        quick = True
    elif x == "-Q":
        quick = False
    else:
        roots.append(x)

class T:
    # debugging helper: exits the program after being called t times
    def __init__(self,t=10):
        self.t = t
    def __call__(self):
        self.t -= 1
        if self.t <= 0:
            doprint(f"Exiting")
            sys.exit(0)
# t = T(1000)
# pass 1: compile by_size
doprint("# Finding files")
by_size = defaultdict(list)
for root in roots:
    for rt, dirs, files in os.walk(root):
        for f in files:
            doprint(".",end="")
            path = os.path.join(rt,f)
            sz = os.path.getsize(path)
            by_size[sz].append(path)
doprint("# Done finding files")

candidates = []
for sz,fs in by_size.items():
    if len(fs) > 1:
        candidates += fs
doprint(f"# {len(candidates)} candidates by size")
if len(candidates) == 0:
    exit(0)

# pass 2: compile by_hash64k
by_hash64k = defaultdict(list)
for c in candidates:
    h = hash_first(c,64*1024)
    by_hash64k[h].append(c)
candidates = []
for h,fs in by_hash64k.items():
    if len(fs) > 1:
        candidates += fs
doprint(f"# {len(candidates)} candidates by hash 64k")
if len(candidates) == 0:
    exit(0)
# pass 3: compile by_hash1m
by_hash1m = defaultdict(list)
for c in candidates:
    h = hash_first(c,1024*1024)
    by_hash1m[h].append(c)
candidates = []
for h,fs in by_hash1m.items():
    if len(fs) > 1:
        candidates += fs
doprint(f"# {len(candidates)} candidates by hash 1M")
if len(candidates) == 0:
    exit(0)

if quick:
    # quick mode (default): report files matching on the first 1M as duplicates
    dups = False
    for h,fs in by_hash1m.items():
        if len(fs) > 1:
            if not dups:
                dups = True
                doprint(f"Dups:\n=====\n\n")
            doprint(f"hash {h}:")
            for f in fs:
                doprint(f" {f}")
else:
    # pass 4 (only with -Q): compile by_hashall by hashing the remaining candidates in full
    by_hashall = defaultdict(list)
    dups = False
    for c in candidates:
        h = hash_all(c)
        by_hashall[h].append(c)
    for h,fs in by_hashall.items():
        if len(fs) > 1:
            if not dups:
                dups = True
                doprint(f"Dups:\n=====\n\n")
            doprint(f"hash {h}:")
            for f in fs:
                doprint(f" {f}")
Minimalist version
The simplest example of a duplicate file finder just takes the hash of every file. Hashing every file in full is a large performance hit when the tree contains many large files that are not actually duplicates (the version above first looks for files of the same size, then compares just the first 64k to weed out different files of the same size, then compares just the first 1M of any remaining candidates, and only then resorts to hashing the entire file). But the code below shows how it can be done in only 13 lines of Python, rather than the hundred-odd lines above.
#!/usr/bin/env python3
import hashlib,os,sys,collections

by_hash = collections.defaultdict(list)
hash_file = lambda filename: hashlib.sha256(open(filename,"rb").read()).hexdigest()

for arg in sys.argv[1:]:
    for root,dirs,files in os.walk(arg):
        for filename in files:
            path = f"{root}/{filename}"
            by_hash[hash_file(path)].append(path)

for hashvalue,filenames in by_hash.items():
    if len(filenames) > 1:
        print(hashvalue)
        for filename in filenames:
            print(f" {filename}")
Alternatively, to hash just the first 64k of each file, which may give false positives, change the hash_file = line to
hash_file = lambda filename: hashlib.sha256(open(filename,"rb").read(64*1024)).hexdigest()
Basically, the more of each file you hash, the more likely you are to detect differences. For things like multi-gigabyte video files, if two files are not identical, then at least one byte within the first 64k will usually differ.
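If the 64k variant is used, its false positives can be eliminated cheaply afterwards, since only the files it flags as possible duplicates need to be hashed in full. A minimal sketch of that check, written as a continuation of the script above (hashlib, collections and by_hash as already defined) and replacing its final reporting loop:
# re-hash in full only the files the 64k pass grouped together,
# confirming or rejecting each suspected duplicate group
full_hash = lambda filename: hashlib.sha256(open(filename,"rb").read()).hexdigest()
confirmed = collections.defaultdict(list)
for hashvalue,filenames in by_hash.items():
    if len(filenames) > 1:
        for filename in filenames:
            confirmed[full_hash(filename)].append(filename)
for hashvalue,filenames in confirmed.items():
    if len(filenames) > 1:
        print(hashvalue)
        for filename in filenames:
            print(f" {filename}")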