# blob: 5d5c74272479981013e887b25b77ef245c5e6703 [file] [log] [blame]
from datetime import datetime
from typing import Dict, List, Tuple
import pandas as pd
import requests
from hamilton.function_modifiers import save_to, value
from hamilton.htypes import Collect, Parallelizable
def starcount_url(repositories: List[str]) -> Parallelizable[str]:
    """Yield one GitHub API repository URL per requested repository.

    The star count read from each URL is what later allows the stargazer
    requests to be paginated.

    :param repositories: Repository names in the format 'organization/repo'
    :return: A generator of GitHub API URLs, one per repository
    """
    yield from (f"https://api.github.com/repos/{name}" for name in repositories)
def star_count(starcount_url: str, github_api_key: str) -> Tuple[str, int]:
    """Fetch the star count for a single repository from the GitHub API.

    :param starcount_url: GitHub API URL of the repo
    :param github_api_key: API key for GitHub, sent in the ``Authorization`` header
    :return: A tuple of the repo's full name and its star count
    :raises requests.HTTPError: If the API answers with an error status
    :raises requests.Timeout: If the API does not answer in time
    """
    response = requests.get(
        starcount_url,
        headers={"Authorization": f"token {github_api_key}"},
        # requests waits indefinitely unless a timeout is given; bound the wait
        # so one unresponsive connection cannot hang the whole run.
        timeout=30,
    )
    response.raise_for_status()  # Raise an exception for unsuccessful requests
    data = response.json()
    return data["full_name"], data["stargazers_count"]
def stars_by_repo(star_count: Collect[Tuple[str, int]]) -> Dict[str, int]:
    """Collapse the collected (repo name, star count) pairs into one mapping.

    The mapping is what the pagination step uses to work out how many pages
    each repository needs. If a repo name appears more than once, the last
    pair seen wins.

    :param star_count: Collected tuples of repo name and star count
    :return: The star count for each repo, keyed by repo name
    """
    return {name: count for name, count in star_count}
def stargazer_url(stars_by_repo: Dict[str, int], per_page: int = 100) -> Parallelizable[str]:
    """Yield one stargazer-listing URL per result page of every repository.

    :param stars_by_repo: The star count for each repo, keyed by repo name
    :param per_page: The number of results per page
    :return: A generator of paginated GitHub API URLs (pages are 1-based)
    """
    for repo_name, total_stars in stars_by_repo.items():
        # Ceiling division: a partially filled final page still needs a request.
        last_page = (total_stars + per_page - 1) // per_page
        for page_number in range(1, last_page + 1):
            yield f"https://api.github.com/repos/{repo_name}/stargazers?page={page_number}&per_page={per_page}"
def stargazers(stargazer_url: str, github_api_key: str) -> pd.DataFrame:
    """Fetch one page of stargazers from the GitHub API.

    :param stargazer_url: Paginated GitHub API URL listing a repo's stargazers
    :param github_api_key: API key for GitHub, sent in the ``Authorization`` header
    :return: A DataFrame with one row per stargazer on this page, holding the
        ``user`` login and the ``starred_at`` timestamp (naive datetime parsed
        from the API's ``...Z`` string)
    :raises requests.HTTPError: If the API answers with an error status
    :raises requests.Timeout: If the API does not answer in time
    """
    headers = {
        "Authorization": f"token {github_api_key}",
        # Star media type: each item carries "starred_at" next to the "user" object.
        "Accept": "application/vnd.github.v3.star+json",
    }
    # requests waits indefinitely unless a timeout is given; bound the wait so
    # one unresponsive connection cannot hang the whole run.
    response = requests.get(stargazer_url, headers=headers, timeout=30)
    response.raise_for_status()  # Raise an exception for unsuccessful requests
    data = response.json()
    records = [
        {
            "user": datum["user"]["login"],
            "starred_at": datetime.strptime(datum["starred_at"], "%Y-%m-%dT%H:%M:%SZ"),
        }
        for datum in data
    ]
    return pd.DataFrame.from_records(records)
@save_to.csv(path=value("unique_stargazers.csv"))
def unique_stargazers(stargazers: Collect[pd.DataFrame]) -> pd.DataFrame:
    """Merge all stargazer pages into one row per unique user.

    When a user appears more than once (for example, starred several of the
    repositories), the earliest ``starred_at`` is kept.

    :param stargazers: Collected per-page DataFrames of stargazers
    :return: A DataFrame indexed by ``user`` with a ``starred_at`` column;
        empty (but with the same index name and column) when there are no
        stargazers at all
    """
    frames = list(stargazers)
    # pd.concat raises ValueError on an empty sequence, and pages without rows
    # carry no columns to sort or group on, so handle "nothing found" here.
    combined = pd.concat(frames) if frames else pd.DataFrame()
    if combined.empty:
        no_rows = pd.DataFrame(
            {
                "user": pd.Series(dtype="object"),
                "starred_at": pd.Series(dtype="datetime64[ns]"),
            }
        )
        return no_rows.set_index("user")
    # Sorting first makes first() pick each user's earliest star.
    return combined.sort_values("starred_at").groupby("user").first()
def final_count(unique_stargazers: pd.DataFrame) -> int:
    """Return how many unique stargazers were found.

    :param unique_stargazers: DataFrame with one row per unique stargazer
    :return: The number of rows, i.e. the number of unique stargazers
    """
    row_count = len(unique_stargazers.index)
    return row_count