Files
Code-LMs/Data/gh_crawler.py
2022-03-09 12:55:58 -05:00

117 lines
3.7 KiB
Python

import requests
import sys
import time
# GitHub API credentials: replace *TOKEN* below with a personal access token.
headers = {"Authorization": "token *TOKEN*"}

# Crawl settings.
NUM_REPOS = 25_000  # Target number of repositories to collect.
MIN_STARS = 50  # Repositories with fewer stars than this are not collected.
LAST_ACTIVE = '2020-01-01'  # Only repositories pushed to after this date are queried.
# Language to crawl: the first command-line argument, or Java when none is given.
LANGUAGE = sys.argv[1] if len(sys.argv) > 1 else "java"
def main():
    """Collect the most-starred repositories for LANGUAGE and write them to disk.

    Repeatedly calls run_query with a decreasing star bound until NUM_REPOS
    distinct repositories have been gathered, a query returns nothing, or a
    query returns only repositories that were already seen. Each repository
    is written to 'TopLists/{LANGUAGE}-top-repos.txt' as one
    '<stars>\\t<url>' line, in descending star order within each batch.
    """
    import os  # Local import: only needed to prepare the output directory.

    # Create the output directory up front; otherwise open() fails with
    # FileNotFoundError when 'TopLists/' does not exist yet.
    os.makedirs('TopLists', exist_ok=True)

    repositories = set()  # URLs already written, to avoid duplicate entries across batches.
    next_max_stars = 1_000_000_000  # Initialize to a very high value.
    with open(f'TopLists/{LANGUAGE}-top-repos.txt', 'w') as f:
        while len(repositories) < NUM_REPOS:
            results = run_query(next_max_stars)  # Get the next set of pages.
            if not results:
                break

            # The lowest star count in this batch becomes the bound for the next query.
            # NOTE(review): run_query filters on `stars:<bound`, so repositories tied at
            # this star count that were not part of this batch would be skipped -- confirm
            # this is acceptable.
            next_max_stars = min(stars for _, stars in results)

            # If a query returns no new repositories, stop rather than loop forever.
            if all(repository in repositories for repository, _ in results):
                break

            for repository, stars in sorted(results, key=lambda e: e[1], reverse=True):
                if repository not in repositories:
                    repositories.add(repository)
                    f.write(f'{stars}\t{repository}\n')
            # Flush after every batch so progress survives an interrupted crawl.
            f.flush()
            print(f'Collected {len(repositories):,} repositories so far; lowest number of stars: {next_max_stars:,}')
def run_query(max_stars):
    """Fetch one batch of repositories that have fewer than `max_stars` stars.

    Pages through the GitHub GraphQL search results, 100 entries at a time,
    until the API reports no further page, a request fails, get_repositories
    signals that the MIN_STARS threshold was crossed, or more than NUM_REPOS
    repositories have been gathered in this batch.

    Args:
        max_stars: Exclusive upper bound on the star count, inserted into the
            search query as `stars:<max_stars`.

    Returns:
        A set of (repository URL, star count) tuples; empty if the first
        request already failed.
    """
    end_cursor = None  # Used to track pagination; "" means there is no further page.
    repositories = set()

    while end_cursor != "":
        # Extracts non-fork, recently active repositories in the provided language, in groups of 100.
        # Leaves placeholders for maximum stars and page cursor. The former allows us to retrieve
        # more than 1,000 repositories by repeatedly lowering the bar.
        after_clause = ', after: "' + end_cursor + '"' if end_cursor else ''
        query = f"""
        {{
          search(query: "language:{LANGUAGE} fork:false pushed:>{LAST_ACTIVE} sort:stars stars:<{max_stars}", type: REPOSITORY, first: 100 {after_clause}) {{
            edges {{
              node {{
                ... on Repository {{
                  url
                  isPrivate
                  isDisabled
                  isLocked
                  stargazers {{
                    totalCount
                  }}
                }}
              }}
            }}
            pageInfo {{
              hasNextPage
              endCursor
            }}
          }}
        }}
        """
        print(f'  Retrieving next page; {len(repositories)} repositories in this batch so far.')

        # Attempt a query up to three times, pausing when a query limit is hit.
        attempts = 0
        success = False
        while not success and attempts < 3:
            request = requests.post('https://api.github.com/graphql', json={'query': query}, headers=headers)
            content = request.json()

            # An error response may carry `"data": null` (or a null `search`). Treat
            # both like a missing key instead of crashing with a TypeError on the
            # membership test or later when the page is unpacked.
            data = content.get('data') if isinstance(content, dict) else None
            search = data.get('search') if isinstance(data, dict) else None
            if search:
                success = True
                continue

            # If this is simply a signal to pause querying, wait two minutes and retry.
            message = content.get('message') if isinstance(content, dict) else None
            if isinstance(message, str) and 'wait' in message:
                attempts += 1
                time.sleep(120)
            # Otherwise, assume we've hit the end of the stream.
            else:
                break
        if not success:
            break

        end_cursor = get_end_cursor(content)
        new_repositories, is_done = get_repositories(content)
        repositories.update(new_repositories)
        if len(repositories) > NUM_REPOS or is_done:
            break
    return repositories
def get_end_cursor(content):
    """Return the cursor for the next results page, or "" when there is none.

    Args:
        content: Decoded GraphQL response containing
            content['data']['search']['pageInfo'].

    Returns:
        The `endCursor` value if `hasNextPage` is truthy, otherwise the empty
        string, which the caller uses as its stop signal.
    """
    info = content['data']['search']['pageInfo']
    return info['endCursor'] if info['hasNextPage'] else ""
def get_repositories(content):
    """Pull (URL, star count) pairs out of one page of search results.

    Entries whose `isPrivate`, `isDisabled` or `isLocked` flag is anything
    other than False are skipped. Scanning stops at the first remaining
    repository with fewer than MIN_STARS stars.

    Args:
        content: Decoded GraphQL response containing
            content['data']['search']['edges'].

    Returns:
        A tuple `(pairs, is_done)`: `pairs` is the list of (url, stars) tuples
        collected from this page, and `is_done` is True when a repository
        below MIN_STARS was encountered, False otherwise.
    """
    collected = []
    for edge in content['data']['search']['edges']:
        node = edge['node']
        # Only keep repositories that are explicitly public, enabled and unlocked.
        if node['isPrivate'] is not False or node['isDisabled'] is not False or node['isLocked'] is not False:
            continue
        url = node['url']
        stars = node['stargazers']['totalCount']
        if stars < MIN_STARS:
            # Below the threshold: report what we have and tell the caller to stop.
            return collected, True
        collected.append((url, stars))
    return collected, False
# Start the crawl only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()