# Mirror of https://github.com/VHellendoorn/Code-LMs.git
# Synced 2025-07-06 21:39:32 +08:00
# 117 lines, 3.7 KiB, Python
import requests
|
|
import sys
|
|
import time
|
|
|
|
# Replace *TOKEN* below with a GitHub API token before running.
headers = {"Authorization": "token *TOKEN*"}

# Crawl settings, plus the target language (first command-line argument).
NUM_REPOS = 25_000  # Stop once this many distinct repositories were collected.
MIN_STARS = 50  # Repositories with fewer stars end the crawl.
LAST_ACTIVE = '2020-01-01'  # Only repositories pushed to after this date are searched.
LANGUAGE = sys.argv[1] if len(sys.argv) > 1 else "java"  # Java is used when no language is passed.
|
|
|
|
def main():
    """Collect the most-starred LANGUAGE repositories and write them to disk.

    Calls run_query repeatedly with a decreasing star ceiling until NUM_REPOS
    distinct repositories were collected, a batch comes back empty, or a batch
    contains nothing new. Every newly seen repository is appended to
    TopLists/{LANGUAGE}-top-repos.txt as one tab-separated "stars, url" line,
    in descending star order within each batch.
    """
    import os  # Local import: only needed to prepare the output directory.

    repositories = set()  # URLs seen so far, to avoid duplicate entries across batches.
    next_max_stars = 1_000_000_000  # Initialize to a very high value.

    # Make sure the output directory exists so open() cannot fail before the first query.
    os.makedirs('TopLists', exist_ok=True)
    with open(f'TopLists/{LANGUAGE}-top-repos.txt', 'w') as f:
        while len(repositories) < NUM_REPOS:
            results = run_query(next_max_stars)  # Get the next set of pages.
            if not results:
                break

            lowest_stars = min(stars for _, stars in results)
            # The query filters with "stars:<ceiling". A batch may be cut off in the
            # middle of a group of repositories sharing the lowest star count, so
            # re-include that count in the next query; duplicates are filtered below.
            # If that would repeat the previous ceiling (the whole batch had a single
            # star count), fall back to the strict bound to keep making progress.
            if lowest_stars + 1 < next_max_stars:
                next_max_stars = lowest_stars + 1
            else:
                next_max_stars = lowest_stars

            # If a query returns no new repositories, stop.
            if all(repository in repositories for repository, _ in results):
                break

            for repository, stars in sorted(results, key=lambda e: e[1], reverse=True):
                if repository not in repositories:
                    repositories.add(repository)
                    f.write(f'{stars}\t{repository}\n')
            f.flush()
            print(f'Collected {len(repositories):,} repositories so far; lowest number of stars: {lowest_stars:,}')
|
|
|
|
|
|
def run_query(max_stars):
    """Fetch one batch of repositories with fewer than max_stars stars.

    Pages through the GitHub GraphQL search results (100 per page) for
    non-fork LANGUAGE repositories pushed to after LAST_ACTIVE, sorted by stars.

    Args:
        max_stars: Exclusive upper bound on the star count, inserted into the
            search query as "stars:<max_stars".

    Returns:
        A set of (url, star_count) tuples. The set may be partial or empty when
        the API stops returning search data or keeps asking the client to wait.
    """
    end_cursor = None  # Used to track pagination; "" means there is no further page.
    repositories = set()

    while end_cursor != "":
        # Extracts non-fork, recently active repositories in the provided language, in groups of 100.
        # Leaves placeholders for maximum stars and page cursor. The former allows us to retrieve
        # more than 1,000 repositories by repeatedly lowering the bar.
        after_clause = ', after: "' + end_cursor + '"' if end_cursor else ''
        query = f"""
{{
    search(query: "language:{LANGUAGE} fork:false pushed:>{LAST_ACTIVE} sort:stars stars:<{max_stars}", type: REPOSITORY, first: 100 {after_clause}) {{
        edges {{
            node {{
                ... on Repository {{
                    url
                    isPrivate
                    isDisabled
                    isLocked
                    stargazers {{
                        totalCount
                    }}
                }}
            }}
        }}
        pageInfo {{
            hasNextPage
            endCursor
        }}
    }}
}}
"""
        print(f' Retrieving next page; {len(repositories)} repositories in this batch so far.')

        # Attempt a query up to three times, pausing when a query limit is hit.
        attempts = 0
        search = None
        content = {}
        while not search and attempts < 3:
            request = requests.post('https://api.github.com/graphql', json={'query': query}, headers=headers)
            content = request.json()
            # Error responses can carry "data": null (or omit "data"/"search" entirely);
            # normalise all of these to "no search payload" instead of raising a TypeError.
            search = (content.get('data') or {}).get('search')
            if search:
                break
            # If this is simply a signal to pause querying, wait two minutes.
            if 'wait' in (content.get('message') or ''):
                attempts += 1
                time.sleep(120)
            # Otherwise, assume we've hit the end of the stream.
            else:
                break
        if not search:
            break

        end_cursor = get_end_cursor(content)
        new_repositories, is_done = get_repositories(content)
        repositories.update(new_repositories)
        if len(repositories) > NUM_REPOS or is_done:
            break
    return repositories
|
|
|
|
|
|
def get_end_cursor(content):
    """Return the cursor of the next results page, or "" when there is none.

    Args:
        content: Decoded GraphQL response containing data.search.pageInfo.
    """
    info = content['data']['search']['pageInfo']
    # The end cursor is only meaningful when another page actually follows.
    return info['endCursor'] if info['hasNextPage'] else ""
|
|
|
|
|
|
def get_repositories(content):
    """Extract (url, star_count) pairs from one page of search results.

    Args:
        content: Decoded GraphQL response containing data.search.edges.

    Returns:
        A tuple (pairs, is_done). pairs lists the repositories that are neither
        private, disabled nor locked, in response order. is_done is True when a
        repository with fewer than MIN_STARS stars was reached, at which point
        extraction stops and the pairs gathered so far are returned.
    """
    excluding_flags = ('isPrivate', 'isDisabled', 'isLocked')
    collected = []
    for entry in content['data']['search']['edges']:
        node = entry['node']
        # Skip anything that is not explicitly public, enabled and unlocked.
        if not all(node[flag] is False for flag in excluding_flags):
            continue
        url = node['url']
        stars = node['stargazers']['totalCount']
        if stars < MIN_STARS:
            return collected, True
        collected.append((url, stars))
    return collected, False
|
|
|
|
|
|
# Start the crawl only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
|