Add data collection scripts

This commit is contained in:
Vincent Hellendoorn
2022-02-04 13:08:20 -05:00
parent 428043d3b5
commit 8c9b1f348f
11 changed files with 50312 additions and 0 deletions

4
.gitignore vendored Normal file
View File

@@ -0,0 +1,4 @@
TopLists/
Code/
Repos/
Preprocessed/

16
Mining/README.md Normal file
View File

@@ -0,0 +1,16 @@
## Purpose
Scripts to construct a dataset of code in a similar way to the ones used to train the released models. Note that because of the nature of the GH API, the exact results of each query will be different, so this will not precisely replicate the training data.
## Usage
Update `gh_crawler.py` by adding your GH API token (line 6). Then, run `collect_data.sh`, which invokes the GitHub API crawler (`gh_crawler.py`), followed by a repo cloning script (`clone_repo.sh`, in parallel), which uses `extract_code.py` to extract all source code files in the corresponding language (and filter very long/short files), and finally `deduplicate.py` to remove duplicate files.
Once this is completed, you can use [gpt-neox](https://github.com/EleutherAI/gpt-neox)'s `preprocess_data.py` (currently in `tools/`) to tokenize this dataset for the model, using either the pretrained code vocabulary (by providing the `code-vocab.json` and `code-merges.txt` files) or a newly produced one.
At the time of this writing*, the following command processes the entire `Code/` directory to a new directory named `Preprocessed/` using the pretrained vocabularies across 16 parallel workers (assuming that `gpt-neox` is checked out in the current directory):
```
mkdir Preprocessed
sudo python3 gpt-neox/tools/preprocess_data.py --input Code --tokenizer-type GPT2BPETokenizer --vocab-file code-vocab.json --merge-file code-merges.txt --output-prefix Preprocessed/code --workers 16
```
And that's it! Just modify the `local_setup.yml` config in the gpt-neox toolkit to point it to the new vocab & merges file and data directory and it should be able to train.
*I did have to modify the `yield_from_files` function to recursively yield all (shuffled) files from a directory; the default version uses `lm_dataformat`, which balks at code file extensions. The updated function can be found in `yield_from_code_files.py`.

25
Mining/clone_repo.sh Normal file
View File

@@ -0,0 +1,25 @@
# Clone a given repository, extract any files belonging to the given language, and delete the repository afterwards to save space.
# Arguments: $1 is a line formatted as stars\thttps://github.com/org/name, $2 is the language name.
in=$1
language=$2

# Extract the org and name from lines formatted as stars\thttps://github.com/org/name
# The expansions are quoted so the tab separator survives word splitting.
repo=$(printf '%s\n' "$in" | cut -d$'\t' -f2)
name_part=$(printf '%s\n' "$repo" | cut -d"/" -f4-6)
name=$(printf '%s\n' "$name_part" | cut -d"/" -f2)
org=$(printf '%s\n' "$name_part" | cut -d"/" -f1)

# Refuse to continue without both parts: with an empty name, the cleanup below
# would remove the whole Repos/<language>/<org> directory instead of one clone.
if [ -z "$org" ] || [ -z "$name" ]; then
  echo "Could not parse a repository from: $in" >&2
  exit 1
fi

echo "Cloning $org/$name"
DIR="Repos/$language/$org"
OUT="Code/$language/$org"

# Skip repositories for which we already have extracted code files.
if [ -d "$OUT/$name" ]; then echo "deja vu"; exit; fi
mkdir -p "$DIR"
mkdir -p "$OUT"

# Clone with depth=1 to only get most recent files, rather than entire history.
if [ ! -d "$DIR/$name" ]; then
  if ! git clone -q --depth 1 "https://github.com/$org/$name" "$DIR/$name"; then
    # Do not run the extraction on a failed clone: it would create an empty
    # output directory and the repository would be skipped on every rerun.
    echo "Failed to clone $org/$name; skipping." >&2
    rm -rf "$DIR/$name"
    exit 1
  fi
fi

# Extract all language-specific code files from the repository and delete it afterwards.
python3 extract_code.py "$language" "$DIR/$name" "$OUT/$name"
rm -rf "$DIR/$name"

50012
Mining/code-merges.txt Normal file
View File

File diff suppressed because it is too large Load Diff

1
Mining/code-vocab.json Normal file
View File

File diff suppressed because one or more lines are too long

24
Mining/collect_data.sh Normal file
View File

@@ -0,0 +1,24 @@
# Hand-picked set of languages.
langs=("C" "C#" "C++" "Go" "Java" "JavaScript" "PHP" "Python" "Ruby" "Rust" "Scala" "TypeScript")

# The crawler writes one repository list per language into this directory.
mkdir -p TopLists

# Install required Python packages.
pip install -r requirements.txt

# Collect 25K repos with at least 50 stars.
# NOTE: the GH API neither guarantees nor (remotely) achieves completeness or consistency, so the resulting set of repositories will be different on each run.
# NOTE: make sure to insert your GH API key into the gh_crawler.py file.
for lang in "${langs[@]}"; do
  python3 gh_crawler.py "$lang"
done

# Clone repositories in parallel and extract all language-specific files.
# -I% hands each input line (stars<TAB>url) to clone_repo.sh as a single argument;
# -n1 is dropped because it is mutually exclusive with -I.
for lang in "${langs[@]}"; do
  xargs -P16 -I% bash clone_repo.sh % "$lang" < "TopLists/${lang}-top-repos.txt"
done

# Deduplicate code files.
python3 deduplicate.py

27
Mining/deduplicate.py Normal file
View File

@@ -0,0 +1,27 @@
import hashlib
import os
ROOT = 'Code'  # NOTE: hard-coded.


def deduplicate(root=ROOT):
    """Delete files under `root` whose content duplicates an earlier file.

    Files are compared by the SHA-256 digest of their raw bytes. Directories
    and file names are visited in sorted order, so the copy that is kept is
    the same on every run.

    :param root: directory to scan recursively.
    :return: a `(files_processed, duplicates_removed)` tuple.
    """
    seen = set()
    count = 0
    dups = 0
    for root_dir, dirs, files in os.walk(root):
        dirs.sort()  # In-place sort makes os.walk descend in a stable order.
        for file_name in sorted(files):
            count += 1
            file_path = os.path.join(root_dir, file_name)
            # Hash the entire file's content.
            with open(file_path, 'rb') as f:
                digest = hashlib.sha256(f.read()).hexdigest()
            # Delete identical files.
            if digest in seen:
                os.remove(file_path)
                dups += 1
            else:
                seen.add(digest)
            # Periodically print progress and the running duplication ratio.
            if count % 10000 == 0:
                print(f'Processed {count:,} files, duplicates so far: {dups:,} ({dups/count:.1%})')
    return count, dups


# Only touch the file system when run as a script, not when imported.
if __name__ == '__main__':
    deduplicate()

57
Mining/extract_code.py Normal file
View File

@@ -0,0 +1,57 @@
"""Copies all files belonging to a given language to a new directory."""
import os
import sys
from shutil import copyfile
import pygments
from pygments.lexers import get_lexer_by_name
from pygments.token import Token
# Basic config options.
MAX_FILE_SIZE = 1024 ** 2  # 1 MB
MIN_FILE_TOKENS = 100


def main():
    """Copy a project's files for one language into a flat output directory.

    Reads three command-line arguments: the language name (resolved to a
    Pygments lexer), the project directory to scan, and the output directory.
    A file is selected when its name matches one of the lexer's filename
    patterns (case-insensitively). Files larger than MAX_FILE_SIZE bytes, or
    that lex to fewer than MIN_FILE_TOKENS tokens, are skipped. Each kept file
    is copied into the output directory under a name that joins its relative
    directory path and file name with '__'.

    Raises:
        ValueError: if fewer than three arguments are provided.
    """
    # Imported locally so this function needs no additional module-level import.
    from fnmatch import fnmatchcase

    if len(sys.argv) <= 3:
        raise ValueError('Provide a language, source directory and target directory.')
    language = sys.argv[1]
    proj_dir = sys.argv[2]
    out_dir = sys.argv[3]

    # Use Pygments to get the language's filename patterns. These are globs
    # (e.g. '*.py', '*.x[bp]m', 'SConstruct'), so they are matched as globs
    # rather than as plain suffixes.
    lexer = get_lexer_by_name(language)
    patterns = {pattern.lower() for pattern in lexer.filenames}

    print(f'Processing: {proj_dir}')
    os.makedirs(out_dir, exist_ok=True)

    files_found = 0
    for root, _, files in os.walk(proj_dir):
        # Flatten the directory part of the path; relpath is robust to a
        # trailing separator on proj_dir, unlike slicing the string.
        rel_dir = os.path.relpath(root, proj_dir)
        prefix = '' if rel_dir == os.curdir else rel_dir.replace(os.sep, '__') + '__'
        for file in files:
            lowered = file.lower()
            if not any(fnmatchcase(lowered, pattern) for pattern in patterns):
                continue
            in_path = os.path.join(root, file)
            if not os.path.exists(in_path):  # Can happen due to broken symlinks.
                continue
            if os.path.getsize(in_path) > MAX_FILE_SIZE:  # Drop excessively long files.
                continue
            with open(in_path, errors='ignore') as f_in:
                text = f_in.read()
            if sum(1 for _ in pygments.lex(text, lexer)) < MIN_FILE_TOKENS:  # Drop files with too few tokens.
                continue

            # Copy all other files to the target directory using a simplified path.
            out_path = os.path.join(out_dir, prefix + file)
            if os.path.exists(out_path):
                continue
            try:
                copyfile(in_path, out_path)
            except Exception as e:
                print(f'Skipping problematic file {in_path} due to: {e}')
            else:
                # Count only files that were actually copied.
                files_found += 1
    print(f'Done processing; copied {files_found} files.')


if __name__ == '__main__':
    main()

116
Mining/gh_crawler.py Normal file
View File

@@ -0,0 +1,116 @@
import requests
import sys
import time
# GitHub API credentials: replace *TOKEN* below with your own API token.
headers = {"Authorization": "token *TOKEN*"}

# Crawl settings.
NUM_REPOS = 25_000  # Stop once this many repositories have been collected.
MIN_STARS = 50  # Repositories with fewer stars end the crawl.
LAST_ACTIVE = '2020-01-01'  # Only repositories pushed to after this date are queried.

# Language to crawl: the first command-line argument, or Java when none is passed.
LANGUAGE = sys.argv[1] if len(sys.argv) > 1 else "java"
def main():
    """Crawl the top repositories for LANGUAGE and write them to a TopLists file.

    Repeatedly calls `run_query` with a decreasing star ceiling until NUM_REPOS
    distinct repositories have been collected or a query yields nothing new.
    Each new repository is written as a `stars<TAB>url` line to
    `TopLists/{LANGUAGE}-top-repos.txt`.
    """
    repositories = set()  # Keep track of a set of repositories seen to avoid duplicate entries across pages.
    next_max_stars = 1_000_000_000  # Initialize to a very high value.
    with open(f'TopLists/{LANGUAGE}-top-repos.txt', 'w') as f:
        while len(repositories) < NUM_REPOS:
            # Get the next set of pages, bounded by the current star ceiling.
            results = run_query(next_max_stars)
            if not results:
                break
            new_repositories = {repository for repository, _ in results}
            # The next query only asks for repositories below this batch's minimum.
            next_max_stars = min(stars for _, stars in results)

            # If a query returns no new repositories, drop it.
            if new_repositories <= repositories:
                break
            for repository, stars in sorted(results, key=lambda e: e[1], reverse=True):
                if repository not in repositories:
                    repositories.add(repository)
                    f.write(f'{stars}\t{repository}\n')
            f.flush()
            print(f'Collected {len(repositories):,} repositories so far; lowest number of stars: {next_max_stars:,}')
def run_query(max_stars):
    """Page through one GitHub search for repositories below `max_stars` stars.

    :param max_stars: exclusive upper bound on the star count placed in the search query.
    :return: a set of `(repository_url, star_count)` tuples gathered across the
        pages of this query. The set may be partial (or empty) when the API
        stops returning search results.
    """
    end_cursor = None  # Used to track pagination.
    repositories = set()

    while end_cursor != "":
        # Extracts non-fork, recently active repositories in the provided language, in groups of 100.
        # Leaves placeholders for maximum stars and page cursor. The former allows us to retrieve more than 1,000 repositories
        # by repeatedly lowering the bar.
        query = f"""
{{
search(query: "language:{LANGUAGE} fork:false pushed:>{LAST_ACTIVE} sort:stars stars:<{max_stars}", type: REPOSITORY, first: 100 {', after: "' + end_cursor + '"' if end_cursor else ''}) {{
edges {{
node {{
... on Repository {{
url
isPrivate
isDisabled
isLocked
stargazers {{
totalCount
}}
}}
}}
}}
pageInfo {{
hasNextPage
endCursor
}}
}}
}}
"""
        print(f'  Retrieving next page; {len(repositories)} repositories in this batch so far.')

        # Attempt a query up to three times, pausing when a query limit is hit.
        attempts = 0
        success = False
        while not success and attempts < 3:
            request = requests.post('https://api.github.com/graphql', json={'query': query}, headers=headers)
            content = request.json()
            # A response may carry "data": null (e.g. alongside errors); treat
            # that like a missing search result instead of raising on None.
            data = content.get('data') or {}
            if data.get('search') is not None:
                success = True
            # If this is simply a signal to pause querying, wait two minutes.
            elif 'message' in content and 'wait' in content['message']:
                attempts += 1
                time.sleep(120)
            # Otherwise, assume we've hit the end of the stream.
            else:
                break
        if not success:
            break

        end_cursor = get_end_cursor(content)
        new_repositories, is_done = get_repositories(content)
        repositories.update(new_repositories)
        if len(repositories) > NUM_REPOS or is_done:
            break
    return repositories
def get_end_cursor(content):
    """Return the cursor for the next results page, or "" when there is none.

    :param content: decoded search response containing `data.search.pageInfo`.
    """
    info = content['data']['search']['pageInfo']
    # The cursor is only read when the response reports a further page.
    return info['endCursor'] if info['hasNextPage'] else ""
def get_repositories(content, min_stars=None):
    """Extract `(url, star_count)` pairs from one page of search results.

    Private, disabled and locked repositories are skipped. Scanning stops at
    the first remaining repository whose star count is below the threshold.

    :param content: decoded search response containing `data.search.edges`.
    :param min_stars: minimum star count to accept; defaults to MIN_STARS.
    :return: `(repositories_with_stars, is_done)`, where `is_done` is True when
        a repository below the threshold was encountered.
    """
    if min_stars is None:
        # Resolved at call time so the module-level setting remains the default.
        min_stars = MIN_STARS

    repositories_with_stars = []
    for edge in content['data']['search']['edges']:
        node = edge['node']
        if node['isPrivate'] is False and node['isDisabled'] is False and node['isLocked'] is False:
            star_count = node['stargazers']['totalCount']
            if star_count < min_stars:
                return repositories_with_stars, True
            repositories_with_stars.append((node['url'], star_count))
    return repositories_with_stars, False
# Start the crawl only when this file is executed as a script.
if __name__ == "__main__":
    main()

2
Mining/requirements.txt Normal file
View File

@@ -0,0 +1,2 @@
# Pygments for lexing (extract_code.py); requests for the GitHub API crawler (gh_crawler.py).
pygments
requests

View File

@@ -0,0 +1,28 @@
"""A drop-in replacement for `yield_from_files` in the gpt-neox tools/preprocess_data.py version which does not rely on lm_dataformat."""
import random
def yield_from_files(dir, semaphore):
    """
    Iterator over input documents, treated as plaintext.

    Files are collected recursively, shuffled, and read one at a time. Empty
    files are skipped; `semaphore.acquire()` is called once before each
    document is yielded.

    :param dir: directory to recursively extract files from.
    :param semaphore: object with an `acquire()` method, called before each yield.
    """
    # Imported locally: this file only imports `random` at module level.
    import os

    fnames = []
    for root, _, files in os.walk(dir):
        for file in files:
            fnames.append(os.path.join(root, file))
    random.shuffle(fnames)

    for fname in fnames:
        # Code files may contain bytes that are not valid text; drop those
        # bytes rather than aborting the whole run on a decode error.
        with open(fname, encoding='utf-8', errors='ignore') as inp:
            doc = inp.read()
        if doc:
            semaphore.acquire()
            yield doc