Add data collection scripts

2026-03-13 10:00:47 +08:00 · 2022-02-04 13:08:20 -05:00
parent 428043d3b5
commit 8c9b1f348f
11 changed files with 50312 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,4 @@
+TopLists/
+Code/
+Repos/
+Preprocessed/
--- a/Mining/README.md
+++ b/Mining/README.md
@@ -0,0 +1,16 @@
+## Purpose
+Scripts to construct a dataset of code in a similar way to the ones used to train the released models. Note that because of the nature of the GH API, the exact results of each query will be different, so this will not precisely replicate the training data.
+
+## Usage
+Update `gh_crawler.py` by adding your GH API token (line 6). Then, run `collect_data.sh`, which invokes the GitHub API crawler (`gh_crawler.py`), followed by a repo cloning script (`clone_repo.sh`, in parallel), which uses `extract_code.py` to extract all source code files in the corresponding language (and filter very long/short files), and finally `deduplicate.py` to remove duplicate files.
+
+Once this is completed, you can use [gpt-neox](https://github.com/EleutherAI/gpt-neox)'s `preprocess_data.py` (currently in `tools/`) to tokenize this dataset for the model, using either the pretrained code vocabulary (by providing the `code-vocab.json` and `code-merges.txt` files) or a newly produced one.
+
+At the time of this writing*, the following command processes the entire `Code/` directory to a new directory named `Preprocessed/` using the pretrained vocabularies across 16 parallel workers (assuming that `gpt-neox` is checked out in the current directory):
+```
+mkdir Preprocessed
+sudo python3 gpt-neox/tools/preprocess_data.py --input Code --tokenizer-type GPT2BPETokenizer --vocab-file code-vocab.json --merge-file code-merges.txt --output-prefix Preprocessed/code --workers 16
+```
+And that's it! Just modify the `local_setup.yml` config in the gpt-neox toolkit to point it to the new vocab & merges files and the data directory, and it should be able to train.
+
+*I did have to modify the `yield_from_files` function to recursively yield all (shuffled) files from a directory; the default version uses `lm_dataformat`, which balks at code file extensions. The updated function can be found in `yield_from_code_files.py`.
--- a/Mining/clone_repo.sh
+++ b/Mining/clone_repo.sh
@@ -0,0 +1,25 @@
+# Clone a given repository, extract any files belonging to the given language, and delete the repository afterwards to save space.
+# Usage: bash clone_repo.sh "<stars><TAB>https://github.com/<org>/<name>" <language>
+in=$1
+language=$2
+
+# Extract the org and name from lines formatted as stars\thttps://github.com/org/name
+# The expansions are quoted so the tab survives word splitting and `cut` really selects the URL field.
+repo=$(printf '%s\n' "$in" | cut -d$'\t' -f2)
+# Splitting the URL on "/" yields: "https:", "", "github.com", org, name.
+org=$(printf '%s\n' "$repo" | cut -d"/" -f4)
+name=$(printf '%s\n' "$repo" | cut -d"/" -f5)
+if [ -z "$org" ] || [ -z "$name" ]; then
+  echo "Could not parse a repository from: $in" >&2
+  exit 1
+fi
+echo "Cloning $org/$name"
+DIR="Repos/$language/$org"
+OUT="Code/$language/$org"
+# Skip repositories for which we already have extracted code files.
+if [ -d "$OUT/$name" ]; then echo "deja vu"; exit; fi
+mkdir -p "$DIR" "$OUT"
+
+# Clone with depth=1 to only get most recent files, rather than entire history.
+if [ ! -d "$DIR/$name" ]; then
+  # Stop on a failed clone: running the extractor anyway would create an empty
+  # output directory, which the "deja vu" check above would treat as done forever.
+  if ! git clone -q --depth 1 "https://github.com/$org/$name" "$DIR/$name"; then
+    echo "Failed to clone $org/$name" >&2
+    rm -rf "$DIR/$name"
+    exit 1
+  fi
+fi
+
+# Extract all language-specific code files from the repository and delete it afterwards.
+python3 extract_code.py "$language" "$DIR/$name" "$OUT/$name"
+rm -rf "$DIR/$name"
--- a/Mining/code-merges.txt
+++ b/Mining/code-merges.txt
--- a/Mining/code-vocab.json
+++ b/Mining/code-vocab.json
--- a/Mining/collect_data.sh
+++ b/Mining/collect_data.sh
@@ -0,0 +1,24 @@
+# Hand-picked set of languages.
+langs=("C" "C#" "C++" "Go" "Java" "JavaScript" "PHP" "Python" "Ruby" "Rust" "Scala" "TypeScript")
+
+# Directory that gh_crawler.py writes the per-language repository lists to.
+mkdir -p TopLists
+
+# Install required Python packages.
+pip install -r requirements.txt
+
+# Collect 25K repos with at least 50 stars.
+# NOTE: the GH API neither guarantees nor (remotely) achieves completeness or consistency, so the resulting set of repositories will be different on each run.
+# NOTE: make sure to insert your GH API key into the gh_crawler.py file.
+for lang in "${langs[@]}"; do
+  python3 gh_crawler.py "$lang"
+done
+
+# Clone repositories in parallel (16 at a time) and extract all language-specific files.
+# -I% already hands xargs exactly one input line per invocation, so the conflicting -n1 is dropped.
+for lang in "${langs[@]}"; do
+  xargs -P16 -I% bash clone_repo.sh % "$lang" < "TopLists/${lang}-top-repos.txt"
+done
+
+# Deduplicate code files.
+python3 deduplicate.py
--- a/Mining/deduplicate.py
+++ b/Mining/deduplicate.py
@@ -0,0 +1,27 @@
+import hashlib
+import os
+
+ROOT = 'Code'  # NOTE: hard-coded.
+seen = set()  # SHA-256 digests of every file kept so far.
+count = 0  # Number of files visited.
+dups = 0  # Number of files deleted as duplicates.
+
+# Walk the whole tree: the first file seen with a given content is kept, later copies are removed.
+for dir_path, _, file_names in os.walk(ROOT):
+	for file_name in file_names:
+		count += 1
+		file_path = os.path.join(dir_path, file_name)
+		# Fingerprint the file by hashing its raw bytes.
+		with open(file_path, 'rb') as handle:
+			digest = hashlib.sha256(handle.read()).hexdigest()
+
+		if digest not in seen:
+			seen.add(digest)
+		else:
+			# Identical content was already kept elsewhere; drop this copy.
+			os.remove(file_path)
+			dups += 1
+
+		# Every 10,000 files, print progress and the running duplication ratio.
+		if not count % 10000:
+			print(f'Processed {count:,} files, duplicates so far: {dups:,} ({dups/count:.1%})')
--- a/Mining/extract_code.py
+++ b/Mining/extract_code.py
@@ -0,0 +1,57 @@
+"""Copies all files belonging to a given language to a new directory."""
+import os
+import sys
+from shutil import copyfile
+
+import pygments
+from pygments.lexers import get_lexer_by_name
+from pygments.token import Token
+
+# Basic config options.
+# Files larger than this many bytes are skipped during extraction.
+MAX_FILE_SIZE = 1024 ** 2  # 1 MB
+# Files that lex to fewer than this many Pygments tokens are skipped.
+MIN_FILE_TOKENS = 100
+
+def main():
+	"""Copy the files of one language from a project into a flat output directory.
+
+	Reads three command-line arguments: a Pygments lexer name, the source
+	(project) directory and the target directory. Every file under the source
+	directory whose name ends with one of the lexer's filename suffixes is
+	copied, unless it is larger than MAX_FILE_SIZE bytes or lexes to fewer than
+	MIN_FILE_TOKENS tokens. Sub-directory structure is flattened into the file
+	name by joining path components with '__'.
+
+	Raises:
+		ValueError: if fewer than three arguments are given.
+	"""
+	if len(sys.argv) <= 3:
+		raise ValueError('Provide a language, source directory and target directory.')
+
+	language = sys.argv[1]
+	proj_dir = sys.argv[2]
+	out_dir = sys.argv[3]
+
+	# Use Pygments to get language extensions.
+	lexer = get_lexer_by_name(language)
+	# Patterns look like '*.py'; dropping the first character leaves the suffix to match.
+	# A tuple lets str.endswith test all suffixes in one call.
+	language_extensions = tuple(set(ext.lower()[1:] for ext in lexer.filenames))
+
+	print(f'Processing: {proj_dir}')
+	os.makedirs(out_dir, exist_ok=True)
+
+	files_found = 0
+	for root, _, files in os.walk(proj_dir):
+		# relpath (rather than slicing by len(proj_dir)) stays correct when proj_dir
+		# is passed with a trailing separator; os.sep keeps the flattening portable.
+		rel_dir = os.path.relpath(root, proj_dir)
+		rel_path = '' if rel_dir == os.curdir else rel_dir.replace(os.sep, '__')
+		for file in files:
+			if not file.endswith(language_extensions):
+				continue
+			in_path = os.path.join(root, file)
+			if not os.path.exists(in_path):  # Can happen due to broken symlinks.
+				continue
+			if os.path.getsize(in_path) > MAX_FILE_SIZE:  # Drop excessively long files.
+				continue
+			with open(in_path, errors='ignore') as f_in:
+				text = f_in.read()
+			if sum(1 for _ in pygments.lex(text, lexer)) < MIN_FILE_TOKENS:  # Drop files with too few tokens.
+				continue
+
+			# Copy all other files to the target directory using a simplified path.
+			out_path = os.path.join(out_dir, rel_path + ('__' if rel_path else '') + file)
+			if os.path.exists(out_path):
+				continue
+			try:
+				copyfile(in_path, out_path)
+			except OSError as e:
+				print(f'Skipping problematic file {in_path} due to: {e}')
+			else:
+				# Only count files that were actually copied, so the summary is accurate.
+				files_found += 1
+	print(f'Done processing; copied {files_found} files.')
+
+
+if __name__ == '__main__':
+	main()
--- a/Mining/gh_crawler.py
+++ b/Mining/gh_crawler.py
@@ -0,0 +1,116 @@
+import requests
+import sys
+import time
+
+# Insert GitHub API token here, in place of *TOKEN*.
+headers = {"Authorization": "token *TOKEN*"}
+
+# Constants & language argument.
+NUM_REPOS = 25_000  # Stop crawling once this many repositories have been collected.
+MIN_STARS = 50  # A repository with fewer stars than this ends the crawl.
+LAST_ACTIVE = '2020-01-01'  # Only repositories pushed to after this date are queried.
+LANGUAGE = "java" if len(sys.argv) <= 1 else sys.argv[1]  # Default to Java, if none passed.
+
+def main():
+	"""Crawl the most-starred repositories for LANGUAGE and write them to a file.
+
+	Calls run_query repeatedly, each time with the lowest star count seen in the
+	previous batch as the new ceiling, until NUM_REPOS repositories have been
+	collected, a query returns nothing, or a query returns nothing new. Each
+	repository is written to TopLists/<LANGUAGE>-top-repos.txt as one
+	"<stars><TAB><url>" line.
+	"""
+	repositories = set()  # Keep track of a set of repositories seen to avoid duplicate entries across pages.
+	next_max_stars = 1_000_000_000  # Initialize to a very high value.
+	with open(f'TopLists/{LANGUAGE}-top-repos.txt', 'w') as f:
+		while len(repositories) < NUM_REPOS:
+			# Pass the current ceiling (the original referenced an undefined `max_stars` here).
+			results = run_query(next_max_stars)  # Get the next set of pages.
+			if not results:
+				break
+			new_repositories = [repository for repository, _ in results]
+			next_max_stars = min(stars for _, stars in results)
+
+			# If a query returns no new repositories, stop: lowering the ceiling further will not help.
+			if repositories.issuperset(new_repositories):
+				break
+			for repository, stars in sorted(results, key=lambda e: e[1], reverse=True):
+				if repository not in repositories:
+					repositories.add(repository)
+					f.write(f'{stars}\t{repository}\n')
+			# Flush after every batch so partial results survive an interrupted crawl.
+			f.flush()
+			print(f'Collected {len(repositories):,} repositories so far; lowest number of stars: {next_max_stars:,}')
+
+
+def run_query(max_stars):
+	"""Fetch all result pages for repositories with fewer than `max_stars` stars.
+
+	Pages through the GitHub GraphQL search 100 results at a time and returns
+	the set of (url, star_count) tuples gathered. Paging stops when there is
+	no next page, a request fails, more than NUM_REPOS entries were gathered,
+	or get_repositories reports that the MIN_STARS floor was reached.
+	"""
+	collected = set()
+	end_cursor = None  # Used to track pagination; "" means there is no further page.
+
+	while end_cursor != "":
+		# Extracts non-fork, recently active repositories in the provided language, in groups of 100.
+		# Leaves placeholders for maximum stars and page cursor. The former allows us to retrieve more than 1,000 repositories
+		# by repeatedly lowering the bar.
+		query = f"""
+		{{
+		  search(query: "language:{LANGUAGE} fork:false pushed:>{LAST_ACTIVE} sort:stars stars:<{max_stars}", type: REPOSITORY, first: 100 {', after: "' + end_cursor + '"' if end_cursor else ''}) {{
+			edges {{
+			  node {{
+				... on Repository {{
+				  url
+				  isPrivate
+				  isDisabled
+				  isLocked
+				  stargazers {{
+					totalCount
+				  }}
+				}}
+			  }}
+			}}
+			pageInfo {{
+			  hasNextPage
+			  endCursor
+			}}
+		  }}
+		}}
+		"""
+		print(f'  Retrieving next page; {len(collected)} repositories in this batch so far.')
+		# Attempt a query up to three times, pausing when a query limit is hit.
+		succeeded = False
+		for _ in range(3):
+			request = requests.post('https://api.github.com/graphql', json={'query': query}, headers=headers)
+			content = request.json()
+			if 'data' in content and 'search' in content['data']:
+				succeeded = True
+				break
+			# If this is simply a signal to pause querying, wait two minutes and retry.
+			if 'message' in content and 'wait' in content['message']:
+				time.sleep(120)
+				continue
+			# Otherwise, assume we've hit the end of the stream.
+			break
+		if not succeeded:
+			break
+		end_cursor = get_end_cursor(content)
+		page_repositories, is_done = get_repositories(content)
+		collected.update(page_repositories)
+		if is_done or len(collected) > NUM_REPOS:
+			break
+	return collected
+
+
+def get_end_cursor(content):
+	"""Return the cursor for the next page, or "" when the response has no next page."""
+	page_info = content['data']['search']['pageInfo']
+	return page_info['endCursor'] if page_info['hasNextPage'] else ""
+
+
+def get_repositories(content):
+	"""Extract (url, star_count) pairs from one search response.
+
+	Private, disabled and locked repositories are skipped. Returns a tuple
+	(pairs, is_done); is_done is True as soon as a usable repository with fewer
+	than MIN_STARS stars is encountered, in which case the pairs gathered up to
+	that point are returned.
+	"""
+	status_flags = ('isPrivate', 'isDisabled', 'isLocked')
+	repositories_with_stars = []
+	for edge in content['data']['search']['edges']:
+		node = edge['node']
+		# Each flag must be exactly False for the repository to be usable.
+		if not all(node[flag] is False for flag in status_flags):
+			continue
+		star_count = node['stargazers']['totalCount']
+		if star_count < MIN_STARS:
+			return repositories_with_stars, True
+		repositories_with_stars.append((node['url'], star_count))
+	return repositories_with_stars, False
+
+
+# Run the crawl only when executed as a script, not when imported.
+if __name__ == '__main__':
+	main()
--- a/Mining/requirements.txt
+++ b/Mining/requirements.txt
@@ -0,0 +1,2 @@
+# Pygments for lexing (extract_code.py); requests for the GitHub API crawler (gh_crawler.py).
+pygments
+requests
--- a/Mining/yield_from_code_files.py
+++ b/Mining/yield_from_code_files.py
@@ -0,0 +1,28 @@
+"""A drop-in replacement for `yield_from_files` in the gpt-neox tools/preprocess_data.py version which does not rely on lm_dataformat."""
+
+import random
+def yield_from_files(dir, semaphore):
+    """
+    Iterator over input documents, treated as plaintext.
+
+    :param dir: directory to recursively extract files from.
+    :param semaphore: object whose `acquire()` is called once before each
+        document is yielded (presumably released by the consumer -- confirm
+        against the caller in preprocess_data.py).
+    :return: generator over the text of every non-empty file, in shuffled order.
+    """
+    # Imported locally so this drop-in snippet does not rely on the host module's imports.
+    import os
+
+    fnames = []
+    for root, _, files in os.walk(dir):
+        fnames.extend(os.path.join(root, file) for file in files)
+    random.shuffle(fnames)
+
+    for fname in fnames:
+        # Undecodable bytes are dropped instead of aborting the whole run; the
+        # extraction step reads files the same way, so such files can reach this point.
+        with open(fname, errors='ignore') as inp:
+            doc = inp.read()
+        # Empty files produce no document and do not touch the semaphore.
+        if doc:
+            semaphore.acquire()
+            yield doc