mirror of
https://github.com/VHellendoorn/Code-LMs.git
synced 2025-07-06 13:09:56 +08:00
25 lines
834 B
Bash
25 lines
834 B
Bash
# Hand-picked set of languages.
# Each entry is passed verbatim as the language argument to gh_crawler.py and
# clone_repo.sh, and is used to build the TopLists/<lang>-top-repos.txt path.
langs=(
  "C"
  "C#"
  "C++"
  "Go"
  "Java"
  "JavaScript"
  "PHP"
  "Python"
  "Ruby"
  "Rust"
  "Scala"
  "TypeScript"
)
# Make sure the TopLists directory exists; the clone step reads its
# per-language "<lang>-top-repos.txt" lists from there.
# `mkdir -p` succeeds silently when the directory is already present, so no
# separate existence test is needed.
mkdir -p TopLists
# Install required Python packages (listed in requirements.txt in the current
# directory).
pip install -r requirements.txt
# Collect 25K repos with at least 50 stars.
# NOTE: the GH API neither guarantees nor (remotely) achieves completeness or
# consistency, so the resulting set of repositories will be different on each run.
# NOTE: make sure to insert your GH API key into the gh_crawler.py file.
# The expansions are quoted so every language name reaches the crawler as
# exactly one argument, with no word splitting or globbing.
for lang in "${langs[@]}"; do
  python3 gh_crawler.py "$lang"
done
# Clone repositories in parallel and extract all language-specific files.
# For each language, every line of TopLists/<lang>-top-repos.txt is handed to
# clone_repo.sh as its first argument, with the language as the second.
for lang in "${langs[@]}"; do
  # -P16 runs up to 16 clone_repo.sh processes at once.
  # -I% already makes xargs run one command per input line; the former -n1 was
  # redundant and is rejected by GNU xargs as mutually exclusive with -I.
  # The list is redirected into xargs directly instead of being piped via cat.
  xargs -P16 -I% bash clone_repo.sh % "$lang" < "TopLists/${lang}-top-repos.txt"
done
# Deduplicate code files.
python3 deduplicate.py