mirror of
https://github.com/VHellendoorn/Code-LMs.git
synced 2025-07-06 13:09:56 +08:00
28 lines
637 B
Python
28 lines
637 B
Python
import hashlib
|
|
import os
|
|
|
|
ROOT = 'Code'  # NOTE: hard-coded.

# Files are hashed in pieces of this many bytes so that memory use stays
# bounded no matter how large an individual file is.
CHUNK_SIZE = 1 << 20


def file_sha256(file_path, chunk_size=CHUNK_SIZE):
    """Return the hexadecimal SHA-256 digest of the file at *file_path*.

    The file is opened in binary mode and fed to the hash ``chunk_size``
    bytes at a time, so the whole content is never held in memory at once.
    The digest is identical to hashing the entire content in one call.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    digest = hashlib.sha256()
    with open(file_path, 'rb') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()


def remove_duplicates(root, seen=None, report_every=10000):
    """Delete every file under *root* whose content has already been seen.

    Walks *root* with ``os.walk`` and hashes each listed file. The first
    file encountered with a given content hash is kept; every later file
    with the same hash is removed from disk.

    Args:
        root: Directory to walk. A path that does not exist yields no files
            (``os.walk`` ignores listing errors by default).
        seen: Optional set of hex digests treated as already encountered.
            It is updated in place with the hashes of the files that are
            kept. A fresh set is used when omitted.
        report_every: Print a progress line each time this many files have
            been processed. A falsy value disables progress output.

    Returns:
        A ``(count, dups)`` tuple: files processed and files deleted.

    Raises:
        OSError: if a file cannot be read or removed.
    """
    # NOTE(review): a symlink to a regular file is hashed through to its
    # target, so a link and its target count as duplicates of each other and
    # whichever is visited second is removed -- confirm that is acceptable
    # for trees that contain symlinks.
    if seen is None:
        seen = set()
    count = 0
    dups = 0

    for root_dir, _, files in os.walk(root):
        for file_name in files:
            count += 1
            file_path = os.path.join(root_dir, file_name)
            content_hash = file_sha256(file_path)

            # Delete identical files; remember the hash of each one kept.
            if content_hash in seen:
                os.remove(file_path)
                dups += 1
            else:
                seen.add(content_hash)

            # Periodically print progress and the running duplication ratio.
            # count is at least 1 here, so the division is always safe.
            if report_every and count % report_every == 0:
                print(f'Processed {count:,} files, duplicates so far: {dups:,} ({dups/count:.1%})')

    return count, dups


# Run the deduplication over ROOT, leaving the results in the same
# module-level names (seen, count, dups) the script has always exposed.
seen = set()
count, dups = remove_duplicates(ROOT, seen)