lang/python/FindDuplicateFiles


Usage

fdup [-q|-Q] root root2 ...

where root, root2, etc. are directories to search recursively from. The -q flag (the default) reports duplicates after the 1M-prefix pass; -Q goes on to hash the remaining candidates in full before reporting.

Strategy

First, we compile a list of files keyed by file size, the rationale being that if two files' sizes differ, they cannot be equal. From this we compile the list of candidates for the next step.

Next, we group those candidates by the sha256 hash of their first 64k. This is far quicker than hashing an entire file, and is intended to weed out most non-duplicates, since files that differ usually differ somewhere within the first 64k.

Then, following similar reasoning, we hash the first 1 megabyte of potential duplicate candidates.

Finally, for the files we still cannot rule out as duplicates, we hash the entire file, so as to be certain. (We could instead have compared them byte by byte at this stage, as sketched below.)
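
For that final check, a byte-by-byte comparison could lean on the standard library's filecmp module instead of a hash. This is only a sketch of the alternative mentioned above, not what the program below actually does, and confirm_duplicates is a hypothetical helper name:

import filecmp
from itertools import combinations

def confirm_duplicates(candidates):
  """Pairwise byte-by-byte comparison of a list of candidate paths."""
  dups = []
  for a, b in combinations(candidates, 2):
    # shallow=False makes filecmp compare file contents rather than just os.stat() metadata
    if filecmp.cmp(a, b, shallow=False):
      dups.append((a, b))
  return dups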

Writing out the inode of each file as well would let us tell apart the case of two hard links pointing to the same file from that of two genuinely different files with identical contents.
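
A minimal sketch of how that inode could be obtained with os.stat (file_identity is a hypothetical helper, not part of the listing below; st_dev is included because inode numbers are only unique within a single filesystem):

import os

def file_identity(path):
  """Return (device, inode); two hard links to the same file share both numbers."""
  st = os.stat(path)
  return (st.st_dev, st.st_ino)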

Source

#!/usr/bin/env python3
import sys, os, os.path
import hashlib
from collections import defaultdict

quick = True  # quick mode (default, or -q): report after the 1M-prefix pass; -Q hashes whole files

def doprint(*xs,**kw):
  """print() that flushes its stream immediately, so progress output appears as it happens."""
  if "file" in kw:
    f = kw['file']
  else:
    f = sys.stdout
  print(*xs,**kw)
  f.flush()

def hash_first(filename,chunk_size=64*1024):
  """Return the sha256 hex digest of the first chunk_size bytes of filename."""
  doprint(f"# Hashing first {chunk_size} of {filename}")
  with open(filename,"rb") as f:
    data = f.read(chunk_size)
  return hashlib.sha256(data).hexdigest()

def hash_all(filename,chunk_size=1024*1024):
  """Return the sha256 hex digest of the whole of filename, read in chunks,
  printing a rough progress indicator as it goes."""
  doprint(f"# Hashing all of {filename}")
  sz = os.path.getsize(filename)
  bytes_read = 0
  sha = hashlib.sha256()
  chunks = 0
  with open(filename,"rb") as f:
    while byte_block := f.read(chunk_size):
      doprint("#",end="")          # one '#' per chunk read
      sha.update(byte_block)
      bytes_read += len(byte_block)
      chunks += 1
      if chunks >= 30:             # every 30 chunks, report progress as a percentage
        pc = 100*(bytes_read/sz)
        doprint(f"# {pc:0.3f}%")
        chunks = 0
  doprint()
  return sha.hexdigest()


# command-line parsing: -q / -Q select quick or thorough mode; anything else is a root directory
roots = []
for x in sys.argv[1:]:
  if x == "-q":
    quick = True
  elif x == "-Q":
    quick = False
  else:
    roots.append(x)

# debug helper: a countdown that exits the program after t calls (not used below)
class T:
  def __init__(self,t=10):
    self.t = t
  def __call__(self):
    self.t -= 1
    if self.t <= 0:
      doprint(f"Exiting")
      sys.exit(0)

# t = T(1000)

# pass 1: compile by_size
doprint("# Finding files")
by_size = defaultdict(list)
for root in roots:
  for rt, dirs, files in os.walk(root):
    for f in files:
      doprint(".",end="")          # one dot per file found, as a progress indicator
      path = os.path.join(rt,f)
      try:
        sz = os.path.getsize(path)
      except OSError:              # e.g. a broken symlink or unreadable file
        continue
      by_size[sz].append(path)
doprint()
doprint("# Done finding files")

candidates = []
for sz,fs in by_size.items():
  if len(fs) > 1:
    candidates += fs
doprint(f"# {len(candidates)} candidates by size")
if len(candidates) == 0:
  sys.exit(0)

# pass 2: compile by_hash64k
by_hash64k = defaultdict(list)
for c in candidates:
  h = hash_first(c,64*1024)
  by_hash64k[h].append(c)
candidates = []
for h,fs in by_hash64k.items():
  if len(fs) > 1:
    candidates += fs
doprint(f"# {len(candidates)} candidates by hash 64k")
if len(candidates) == 0:
  sys.exit(0)

# pass 3: compile by_hash1m
by_hash1m = defaultdict(list)
for c in candidates:
  h = hash_first(c,1024*1024)
  by_hash1m[h].append(c)
candidates = []
for h,fs in by_hash1m.items():
  if len(fs) > 1:
    candidates += fs
doprint(f"# {len(candidates)} candidates by hash 1M")
if len(candidates) == 0:
  sys.exit(0)

if quick:
  # quick mode: report duplicates based on the 1M-prefix hash alone
  # (in principle this can give false positives for files that only differ after the first megabyte)
  dups = False
  for h,fs in by_hash1m.items():
    if len(fs) > 1:
      if not dups:
        dups = True
        doprint(f"Dups:\n=====\n\n")
      doprint(f"hash {h}:")
      for f in fs:
        doprint(f"  {f}")
else:
  # pass 4: compile by hashall
  by_hashall = defaultdict(list)
  dups = False
  for c in candidates:
    h = hash_all(c)
    by_hashall[h].append(c)
  for h,fs in by_hashall.items():
    if len(fs) > 1:
      if not dups:
        dups = True
        doprint(f"Dups:\n=====\n\n")
      doprint(f"hash {h}:")
      for f in fs:
        doprint(f"  {f}")

Minimalist version

The simplest possible duplicate file finder just takes the hash of every file. This is a large performance hit when most files turn out not to be duplicates (the version above first looks for files of the same size, then compares just the first 64k to weed out different files of the same size, then compares just the first 1M of any remaining candidate duplicates, and only then resorts to hashing the entire file). But the code below shows how it can be done in only 13 lines of Python, rather than just under 100.

#!/usr/bin/env python3
import hashlib,os,sys,collections
by_hash = collections.defaultdict(list)
hash_file = lambda filename: hashlib.sha256(open(filename,"rb").read()).hexdigest()
for arg in sys.argv[1:]:
  for root,dirs,files in os.walk(arg):
    for filename in files:
      path = f"{root}/{filename}"
      by_hash[hash_file(path)].append(path)
for hashvalue,filenames in by_hash.items():
  if len(filenames) > 1:
    print(hashvalue)
    for filename in filenames:
      print(f"  {filename}")

Alternatively, to hash only the first 64k of each file (which may give false positives), change the hash_file = line to

hash_file = lambda filename: hashlib.sha256(open(filename,"rb").read(64*1024)).hexdigest()

Basically, the more of each file you hash, the more likely you are to detect differences. For things like multi-gigabyte video files, if two files are not identical, then at least one byte in the first 64k will most likely differ.
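
As a middle ground between the two versions, the by-size pre-filter from the longer program can be bolted onto the minimalist one, so that files with a unique size are never hashed at all. A sketch, assuming the same convention of passing root directories as arguments:

#!/usr/bin/env python3
import hashlib,os,sys,collections

# pass 1: group every file under the given roots by size
by_size = collections.defaultdict(list)
for arg in sys.argv[1:]:
  for root,dirs,files in os.walk(arg):
    for filename in files:
      path = os.path.join(root,filename)
      by_size[os.path.getsize(path)].append(path)

# pass 2: hash in full only the files that share their size with at least one other file
by_hash = collections.defaultdict(list)
for size,paths in by_size.items():
  if len(paths) > 1:
    for path in paths:
      with open(path,"rb") as f:
        by_hash[hashlib.sha256(f.read()).hexdigest()].append(path)

# report any group of two or more files with the same full-file hash
for hashvalue,paths in by_hash.items():
  if len(paths) > 1:
    print(hashvalue)
    for path in paths:
      print(f"  {path}")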