examples/parallelism/star_counting/functions.py - hamilton - Git at Google

 from datetime import datetime
 from typing import Dict, List, Tuple

 import pandas as pd
 import requests

 from hamilton.function_modifiers import save_to, value
 from hamilton.htypes import Collect, Parallelizable


 def starcount_url(repositories: List[str]) -> Parallelizable[str]:
     """Yields one GitHub API URL per repository.

     The URLs are emitted one at a time so the star count of each
     repository can be looked up separately downstream.

     :param repositories: Repository names in the format 'organization/repo'
     :return: A generator of GitHub API URLs, one per repository
     """
     yield from (f"https://api.github.com/repos/{name}" for name in repositories)


 def star_count(starcount_url: str, github_api_key: str) -> Tuple[str, int]:
     """Fetches the star count for a single repository from the GitHub API.

     :param starcount_url: GitHub API URL of the repo
     :param github_api_key: API key for GitHub, sent as a token in the
         Authorization header
     :return: A tuple of the repo's full name and its star count
     :raises requests.HTTPError: If the API answers with an unsuccessful status
     :raises requests.Timeout: If the API does not answer within the timeout
     """
     response = requests.get(
         starcount_url,
         headers={"Authorization": f"token {github_api_key}"},
         # requests applies no timeout by default; without one a stalled
         # connection would block this call indefinitely.
         timeout=30,
     )
     response.raise_for_status()  # Raise an exception for unsuccessful requests

     payload = response.json()

     return payload["full_name"], payload["stargazers_count"]


 def stars_by_repo(star_count: Collect[Tuple[str, int]]) -> Dict[str, int]:
     """Merges the collected (repo name, star count) pairs into one mapping.

     If the same repo name shows up more than once, the last pair wins.

     :param star_count: The collected (repo name, star count) tuples
     :return: A dictionary mapping each repo name to its star count
     """
     return {name: count for name, count in star_count}


 def stargazer_url(stars_by_repo: Dict[str, int], per_page: int = 100) -> Parallelizable[str]:
     """Generates the paginated stargazer-listing URLs for each repository.

     For every repository, enough pages are generated to cover its star
     count. Pages are numbered starting at 1.

     :param stars_by_repo: The star count for each repo
     :param per_page: The number of results per page. GitHub serves at most
         100 results per page, so larger values are clamped to 100; otherwise
         the computed page count would be too small to reach every stargazer.
     :return: A generator of GitHub API URLs, one per page per repo
     :raises ValueError: If per_page is less than 1
     """
     if per_page < 1:
         # Zero used to surface as ZeroDivisionError and negative values
         # silently produced no pages at all.
         raise ValueError(f"per_page must be at least 1, got {per_page}")
     page_size = min(per_page, 100)
     for repo_name, stars in stars_by_repo.items():
         # Ceiling division: a partially filled last page still needs a request.
         num_pages = (stars + page_size - 1) // page_size
         for page in range(num_pages):
             yield f"https://api.github.com/repos/{repo_name}/stargazers?page={page + 1}&per_page={page_size}"


 def stargazers(stargazer_url: str, github_api_key: str) -> pd.DataFrame:
     """Fetches one page of stargazers by hitting the GitHub API.

     The request sends the ``application/vnd.github.v3.star+json`` Accept
     header; each entry of the response is read for its ``user.login`` and
     ``starred_at`` fields.

     :param stargazer_url: Paginated GitHub API URL to query
     :param github_api_key: API key for GitHub
     :return: A DataFrame with one row per entry on this page, with the
         columns ``user`` and ``starred_at`` (parsed into a datetime)
     :raises requests.HTTPError: If the API answers with an unsuccessful status
     :raises requests.Timeout: If the API does not answer within the timeout
     """
     headers = {
         "Authorization": f"token {github_api_key}",
         "Accept": "application/vnd.github.v3.star+json",
     }

     # requests applies no timeout by default; without one a stalled
     # connection would block this call indefinitely.
     response = requests.get(stargazer_url, headers=headers, timeout=30)
     response.raise_for_status()  # Raise an exception for unsuccessful requests

     data = response.json()
     records = [
         {
             "user": datum["user"]["login"],
             "starred_at": datetime.strptime(datum["starred_at"], "%Y-%m-%dT%H:%M:%SZ"),
         }
         for datum in data
     ]
     return pd.DataFrame.from_records(records)


 @save_to.csv(path=value("unique_stargazers.csv"))
 def unique_stargazers(stargazers: Collect[pd.DataFrame]) -> pd.DataFrame:
     """Aggregates all pages of stargazers into one row per unique user.

     When a user appears more than once, the row with the earliest
     ``starred_at`` is kept.

     :param stargazers: The collected per-page stargazer DataFrames
     :return: A DataFrame indexed by ``user`` with a ``starred_at`` column;
         empty when there are no stargazers at all
     """
     # Drop empty pages up front: pd.concat raises ValueError when given
     # nothing to concatenate, and an empty page carries no "starred_at"
     # column to sort on.
     pages = [page for page in stargazers if not page.empty]
     if not pages:
         return pd.DataFrame(columns=["starred_at"], index=pd.Index([], name="user"))
     df = pd.concat(pages)
     unique = df.sort_values("starred_at").groupby("user").first()
     return unique


 def final_count(unique_stargazers: pd.DataFrame) -> int:
     """Reports how many unique stargazers were found.

     :param unique_stargazers: DataFrame holding one row per unique stargazer
     :return: The number of rows, i.e. the number of unique stargazers
     """
     row_labels = unique_stargazers.index
     return len(row_labels)
	from datetime import datetime
	from typing import Dict, List, Tuple

	import pandas as pd
	import requests

	from hamilton.function_modifiers import save_to, value
	from hamilton.htypes import Collect, Parallelizable


	def starcount_url(repositories: List[str]) -> Parallelizable[str]:
	    """Generates API URLs for counting stars on each repo.

	    One URL is yielded per repository so each star count can be
	    requested separately.

	    :param repositories: The repository names in the format 'organization/repo'
	    :return: A generator of URLs to the GitHub API, one per repository
	    """
	    for repo in repositories:
	        yield f"https://api.github.com/repos/{repo}"


	def star_count(starcount_url: str, github_api_key: str) -> Tuple[str, int]:
	    """Generates the star count for a given repo.

	    :param starcount_url: URL of the repo
	    :param github_api_key: API key for GitHub, sent as a token in the
	        Authorization header
	    :return: A tuple of the repo name and the star count
	    :raises requests.HTTPError: If the API answers with an unsuccessful status
	    :raises requests.Timeout: If the API does not answer within the timeout
	    """
	    response = requests.get(
	        starcount_url,
	        headers={"Authorization": f"token {github_api_key}"},
	        # requests applies no timeout by default; without one a stalled
	        # connection would block this call indefinitely.
	        timeout=30,
	    )
	    response.raise_for_status()  # Raise an exception for unsuccessful requests

	    data = response.json()

	    return data["full_name"], data["stargazers_count"]


	def stars_by_repo(star_count: Collect[Tuple[str, int]]) -> Dict[str, int]:
	    """Aggregates the star count for each repo into a dictionary.

	    If a repo name appears more than once, the last star count wins.

	    :param star_count: The collected tuples of repo name and star count
	    :return: The star count for each repo
	    """
	    star_count_dict = {}
	    for repo_name, stars in star_count:
	        star_count_dict[repo_name] = stars
	    return star_count_dict


	def stargazer_url(stars_by_repo: Dict[str, int], per_page: int = 100) -> Parallelizable[str]:
	    """Generates the paginated stargazer-listing URLs for each repository.

	    For every repository, enough pages are generated to cover its star
	    count. Pages are numbered starting at 1.

	    :param stars_by_repo: The star count for each repo
	    :param per_page: The number of results per page. GitHub serves at most
	        100 results per page, so larger values are clamped to 100; otherwise
	        the computed page count would be too small to reach every stargazer.
	    :return: A generator of GitHub API URLs, one per page per repo
	    :raises ValueError: If per_page is less than 1
	    """
	    if per_page < 1:
	        # Zero used to surface as ZeroDivisionError and negative values
	        # silently produced no pages at all.
	        raise ValueError(f"per_page must be at least 1, got {per_page}")
	    page_size = min(per_page, 100)
	    for repo_name, stars in stars_by_repo.items():
	        # Ceiling division: a partially filled last page still needs a request.
	        num_pages = (stars + page_size - 1) // page_size
	        for page in range(num_pages):
	            yield f"https://api.github.com/repos/{repo_name}/stargazers?page={page + 1}&per_page={page_size}"


	def stargazers(stargazer_url: str, github_api_key: str) -> pd.DataFrame:
	    """Gives the GitHub username and star time of all stargazers in this
	    query by hitting the GitHub API.

	    The request sends the ``application/vnd.github.v3.star+json`` Accept
	    header; each entry of the response is read for its ``user.login`` and
	    ``starred_at`` fields.

	    :param stargazer_url: Paginated GitHub API URL to query
	    :param github_api_key: API key for GitHub
	    :return: A DataFrame with one row per entry on this page, with the
	        columns ``user`` and ``starred_at`` (parsed into a datetime)
	    :raises requests.HTTPError: If the API answers with an unsuccessful status
	    :raises requests.Timeout: If the API does not answer within the timeout
	    """
	    headers = {
	        "Authorization": f"token {github_api_key}",
	        "Accept": "application/vnd.github.v3.star+json",
	    }

	    # requests applies no timeout by default; without one a stalled
	    # connection would block this call indefinitely.
	    response = requests.get(stargazer_url, headers=headers, timeout=30)
	    response.raise_for_status()  # Raise an exception for unsuccessful requests

	    data = response.json()
	    records = [
	        {
	            "user": datum["user"]["login"],
	            "starred_at": datetime.strptime(datum["starred_at"], "%Y-%m-%dT%H:%M:%SZ"),
	        }
	        for datum in data
	    ]
	    return pd.DataFrame.from_records(records)


	@save_to.csv(path=value("unique_stargazers.csv"))
	def unique_stargazers(stargazers: Collect[pd.DataFrame]) -> pd.DataFrame:
	    """Aggregates all pages of stargazers into one row per unique user.

	    When a user appears more than once, the row with the earliest
	    ``starred_at`` is kept.

	    :param stargazers: The collected per-page stargazer DataFrames
	    :return: A DataFrame indexed by ``user`` with a ``starred_at`` column;
	        empty when there are no stargazers at all
	    """
	    # Drop empty pages up front: pd.concat raises ValueError when given
	    # nothing to concatenate, and an empty page carries no "starred_at"
	    # column to sort on.
	    pages = [page for page in stargazers if not page.empty]
	    if not pages:
	        return pd.DataFrame(columns=["starred_at"], index=pd.Index([], name="user"))
	    df = pd.concat(pages)
	    unique = df.sort_values("starred_at").groupby("user").first()
	    return unique


	def final_count(unique_stargazers: pd.DataFrame) -> int:
	    """Counts the number of unique stargazers.

	    :param unique_stargazers: DataFrame holding one row per unique stargazer
	    :return: The number of rows, i.e. the number of unique stargazers
	    """
	    return len(unique_stargazers)