mirror of
https://github.com/VHellendoorn/Code-LMs.git
synced 2025-07-06 13:09:56 +08:00
57 lines
1.8 KiB
Python
57 lines
1.8 KiB
Python
"""Copies all files belonging to a given language to a new directory."""
|
|
import os
|
|
import sys
|
|
from shutil import copyfile
|
|
|
|
import pygments
|
|
from pygments.lexers import get_lexer_by_name
|
|
from pygments.token import Token
|
|
|
|
# Basic config options.
MAX_FILE_SIZE = 1024 ** 2  # 1 MiB, in bytes; larger files are skipped.
MIN_FILE_TOKENS = 100  # Files that lex to fewer tokens than this are skipped.
|
|
|
|
def main():
    """Copy all files of one language from a project into a flat directory.

    Reads three positional command-line arguments: the Pygments language
    name, the project directory to scan, and the output directory (created
    if missing). A file is copied when its name ends, case-insensitively,
    with one of the language's Pygments filename suffixes, it is at most
    ``MAX_FILE_SIZE`` bytes, and it lexes to at least ``MIN_FILE_TOKENS``
    tokens. The copy is named after the file's path relative to the project
    directory, with directory separators replaced by ``__``. Files whose
    target name already exists are left alone.

    Raises:
        ValueError: If fewer than three arguments are supplied.
    """
    if len(sys.argv) <= 3:
        raise ValueError('Provide a language, source directory and target directory.')

    language, proj_dir, out_dir = sys.argv[1:4]

    # Use Pygments to get language extensions. Dropping the first character
    # turns a pattern such as '*.py' into '.py'; the result is lower-cased,
    # and a tuple lets str.endswith test every suffix in a single call.
    lexer = get_lexer_by_name(language)
    language_extensions = tuple({ext.lower()[1:] for ext in lexer.filenames})

    print(f'Processing: {proj_dir}')
    os.makedirs(out_dir, exist_ok=True)

    files_found = 0
    for root, _, files in os.walk(proj_dir):
        # relpath (rather than slicing off len(proj_dir) + 1 characters)
        # stays correct when proj_dir ends with a separator, and os.sep keeps
        # the flattening portable.
        rel_dir = os.path.relpath(root, proj_dir)
        if rel_dir == os.curdir:
            prefix = ''
        else:
            prefix = rel_dir.replace(os.sep, '__') + '__'

        for file in files:
            # The suffixes are lower-cased, so lower-case the name as well;
            # otherwise files such as 'model.R' can never match.
            if not file.lower().endswith(language_extensions):
                continue

            in_path = os.path.join(root, file)
            if not os.path.exists(in_path):  # Can happen due to broken symlinks.
                continue
            if os.path.getsize(in_path) > MAX_FILE_SIZE:  # Drop excessively long files.
                continue
            with open(in_path, errors='ignore') as f_in:
                text = f_in.read()
            # Drop files with too few tokens.
            if sum(1 for _ in pygments.lex(text, lexer)) < MIN_FILE_TOKENS:
                continue

            # Copy all other files to the target directory using a simplified path.
            out_path = os.path.join(out_dir, prefix + file)
            if os.path.exists(out_path):
                continue
            try:
                copyfile(in_path, out_path)
            except OSError as e:
                print(f'Skipping problematic file {in_path} due to: {e}')
                continue
            # Count only real copies so the summary below is accurate.
            files_found += 1

    print(f'Done processing; copied {files_found} files.')
|
|
|
|
|
|
# Run the extraction only when executed as a script, not when imported.
if __name__ == '__main__':
    main()