# Source: Code-LMs/Data/deduplicate.py (snapshot 2022-03-09 12:55:58 -05:00; originally 28 lines, 637 B, Python)

import hashlib
import os
ROOT = 'Code'  # Default corpus directory, resolved relative to the current working directory.
_CHUNK_SIZE = 1 << 20  # Hash files 1 MiB at a time so a huge file is never fully held in memory.
_REPORT_EVERY = 10000  # Print a progress line after this many processed files.


def _file_digest(file_path, chunk_size=_CHUNK_SIZE):
    """Return the hex SHA-256 digest of the file at *file_path*, read in chunks."""
    digest = hashlib.sha256()
    with open(file_path, 'rb') as f:
        # iter() with a sentinel stops at EOF, where read() returns b''.
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


def deduplicate(root=ROOT, report_every=_REPORT_EVERY):
    """Delete byte-identical duplicate files found anywhere under *root*.

    Every file is hashed with SHA-256 over its full content. The first file
    seen with a given digest is kept; every later file with the same digest
    is removed from disk. Directories and file names are visited in sorted
    order, so which copy survives does not depend on filesystem enumeration
    order.

    Args:
        root: Directory to walk recursively. A missing directory yields no
            files (``os.walk`` ignores the error by default).
        report_every: Print a progress line each time this many files have
            been processed. A falsy value (``0``/``None``) disables progress
            output.

    Returns:
        A ``(count, dups)`` tuple: number of files examined and number of
        files deleted.

    Raises:
        OSError: If a file cannot be opened, read, or removed.
    """
    seen = set()
    count = 0
    dups = 0
    for root_dir, dirs, files in os.walk(root):
        # Sorting in place steers os.walk's top-down descent deterministically.
        dirs.sort()
        for file in sorted(files):
            count += 1
            file_path = os.path.join(root_dir, file)
            # Hash the entire file's content.
            file_hash = _file_digest(file_path)
            # Delete identical files; the first occurrence is the one kept.
            if file_hash in seen:
                os.remove(file_path)
                dups += 1
            else:
                seen.add(file_hash)
            # Periodically print progress and the running duplication ratio.
            if report_every and count % report_every == 0:
                print(f'Processed {count:,} files, duplicates so far: {dups:,} ({dups/count:.1%})')
    return count, dups


if __name__ == '__main__':
    total, removed = deduplicate(ROOT)
    # Guard the ratio: an empty or missing ROOT processes zero files.
    ratio = removed / total if total else 0.0
    print(f'Done: processed {total:,} files, removed {removed:,} duplicates ({ratio:.1%})')