import hashlib
import os

ROOT = 'Code'  # NOTE: hard-coded.

seen = set()
count = 0
dups = 0

for root_dir, _, files in os.walk(ROOT):
    for file in files:
        count += 1
        file_path = os.path.join(root_dir, file)
        # Hash the entire file's content. (Renamed from `bytes`/`hash`
        # to avoid shadowing Python builtins.)
        with open(file_path, 'rb') as f:
            data = f.read()
        digest = hashlib.sha256(data).hexdigest()
        # Delete identical files.
        if digest in seen:
            os.remove(file_path)
            dups += 1
        else:
            seen.add(digest)
        # Periodically print progress and the running duplication ratio.
        if count % 10000 == 0:
            print(f'Processed {count:,} files, duplicates so far: {dups:,} ({dups/count:.1%})')
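
# Note: f.read() above loads each file into memory in full, which is fine
# for source files but wasteful for very large ones. A chunked variant
# (a sketch; the helper name `hash_file` is mine, not from the original
# script) produces an identical digest with constant memory:
def hash_file(path, chunk_size=1 << 20):
    """Stream a file through SHA-256 in 1 MiB chunks."""
    h = hashlib.sha256()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()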