Files
Code-LMs/Data/collect_data.sh
2022-03-09 12:55:58 -05:00

25 lines
834 B
Bash

# Pipeline: list popular repositories per language, clone them in parallel,
# then deduplicate the extracted code files. Steps run strictly in order.
# Failures of individual steps are not fatal (best-effort), so one language
# failing to crawl or one repository failing to clone does not stop the run.

# Hand-picked set of languages.
langs=("C" "C#" "C++" "Go" "Java" "JavaScript" "PHP" "Python" "Ruby" "Rust" "Scala" "TypeScript")

# Directory holding the per-language repository lists that the clone step
# reads (presumably written by gh_crawler.py -- confirm in that script).
# -p makes this a no-op when the directory already exists.
mkdir -p TopLists

# Install required Python packages.
pip install -r requirements.txt

# Collect 25K repos with at least 50 stars.
# NOTE: the GH API neither guarantees nor (remotely) achieves completeness or consistency, so the resulting set of repositories will be different on each run.
# NOTE: make sure to insert your GH API key into the gh_crawler.py file.
for lang in "${langs[@]}"; do
  python3 gh_crawler.py "$lang"
done

# Clone repositories in parallel and extract all language-specific files.
# Each line of the list file is substituted for % (-I processes one input
# line per invocation, so a separate -n1 is redundant and makes GNU xargs
# emit a "mutually exclusive" warning); up to 16 clones run at once.
for lang in "${langs[@]}"; do
  repo_list="TopLists/${lang}-top-repos.txt"
  if [[ ! -f "$repo_list" ]]; then
    # Nothing to clone for this language; report it and move on.
    printf 'warning: %s not found; skipping %s\n' "$repo_list" "$lang" >&2
    continue
  fi
  xargs -P16 -I% bash clone_repo.sh % "$lang" < "$repo_list"
done

# Deduplicate code files.
python3 deduplicate.py