Files
Code-LMs/Data/extract_code.py
2022-03-09 12:55:58 -05:00

57 lines
1.8 KiB
Python

"""Copies all files belonging to a given language to a new directory."""
import fnmatch
import os
import sys
from itertools import islice
from shutil import copyfile

import pygments
from pygments.lexers import get_lexer_by_name
from pygments.token import Token
# Filtering thresholds applied to every candidate file.
MIN_FILE_TOKENS = 100  # Files that lex to fewer tokens than this are dropped.
MAX_FILE_SIZE = 1 << 20  # Size cap in bytes (1 MiB); larger files are dropped.
def _matches_language(filename, patterns):
    """Return True if *filename* matches any of the lowercase glob *patterns*.

    The comparison is case-insensitive: the file name is lowercased before
    being matched against the (already lowercased) patterns.
    """
    lowered = filename.lower()
    return any(fnmatch.fnmatchcase(lowered, pattern) for pattern in patterns)


def _flattened_name(root, proj_dir, filename):
    """Return a single-component file name encoding *root* relative to *proj_dir*.

    Directory separators in the relative path are replaced by ``__`` so that
    every output file can live directly inside the target directory.
    """
    rel_dir = os.path.relpath(root, proj_dir)
    if rel_dir == os.curdir:  # File sits directly in the project root.
        return filename
    return rel_dir.replace(os.sep, '__') + '__' + filename


def _has_enough_tokens(path, lexer):
    """Return True if the file at *path* lexes to at least MIN_FILE_TOKENS tokens."""
    with open(path, encoding='utf-8', errors='ignore') as f_in:
        text = f_in.read()
    # Stop lexing as soon as the threshold is reached; the exact count is not needed.
    tokens = islice(pygments.lex(text, lexer), MIN_FILE_TOKENS)
    return sum(1 for _ in tokens) >= MIN_FILE_TOKENS


def main():
    """Copy all files of one language from a source tree into a flat directory.

    Command-line arguments (read from ``sys.argv``):
        1. language: a name accepted by Pygments' ``get_lexer_by_name``.
        2. source directory: walked recursively.
        3. target directory: created if missing.

    A file is selected when its name matches one of the lexer's file-name
    patterns, it is at most MAX_FILE_SIZE bytes, and it lexes to at least
    MIN_FILE_TOKENS tokens. Selected files are copied to the target directory
    under a name that joins the relative directory components with ``__``.
    Files that already exist in the target are not overwritten.

    Raises:
        ValueError: if fewer than three arguments are supplied.
    """
    if len(sys.argv) <= 3:
        raise ValueError('Provide a language, source directory and target directory.')

    language, proj_dir, out_dir = sys.argv[1:4]

    # Pygments lists file-name *glob patterns* (e.g. '*.py', 'SConstruct'),
    # so they are matched as globs rather than treated as plain suffixes.
    lexer = get_lexer_by_name(language)
    patterns = {pattern.lower() for pattern in lexer.filenames}

    print(f'Processing: {proj_dir}')
    os.makedirs(out_dir, exist_ok=True)

    files_found = 0
    for root, _, files in os.walk(proj_dir):
        for file in files:
            if not _matches_language(file, patterns):
                continue

            in_path = os.path.join(root, file)
            if not os.path.exists(in_path):  # Can happen due to broken symlinks.
                continue

            try:
                if os.path.getsize(in_path) > MAX_FILE_SIZE:  # Drop excessively long files.
                    continue
                if not _has_enough_tokens(in_path, lexer):  # Drop files with too few tokens.
                    continue
            except OSError as e:
                # One unreadable file should not abort the whole walk.
                print(f'Skipping unreadable file {in_path} due to: {e}')
                continue

            # Copy all other files to the target directory using a simplified path.
            out_path = os.path.join(out_dir, _flattened_name(root, proj_dir, file))
            if not os.path.exists(out_path):
                try:
                    copyfile(in_path, out_path)
                except OSError as e:
                    print(f'Skipping problematic file {in_path} due to: {e}')
                    continue  # Do not count files that failed to copy.
            # Files already present in the target directory are counted too.
            files_found += 1

    print(f'Done processing; copied {files_found} files.')
# Run the extraction only when this file is executed as a script.
if __name__ == "__main__":
    main()