examples/LLM_Workflows/knowledge_retrieval/arxiv_articles.py - hamilton - Git at Google

 import os.path
 from typing import Dict

 import arxiv
 import openai
 import pandas as pd

 from hamilton.function_modifiers import extract_fields
 from hamilton.htypes import Collect, Parallelizable


 def arxiv_search_result(
     article_query: str,
     max_arxiv_results: int,
     sort_by: arxiv.SortCriterion.Relevance = arxiv.SortCriterion.Relevance,
 ) -> Parallelizable[arxiv.Result]:
     """Goes to arxiv and returns a list articles that match the provided query.

     :param article_query: the query to search for.
     :param max_arxiv_results: the maximum number of results to return.
     :param sort_by: sort the results by this criterion.
     :return: list of arxiv.Result objects.
     """
     _search = arxiv.Search(
         query=article_query,
         max_results=max_arxiv_results,
         sort_by=sort_by,
     )
     for item in _search.results():
         yield item


 @extract_fields({"title": str, "summary": str, "article_url": str, "pdf_url": str})
 def result(arxiv_search_result: arxiv.Result) -> Dict[str, str]:
     return {
         "title": arxiv_search_result.title,
         "summary": arxiv_search_result.summary,
         "article_url": arxiv_search_result.links[0],
         "pdf_url": arxiv_search_result.links[1],
     }
     pass


 def arxiv_result_embedding(
     title: str,
     embedding_model_name: str,
 ) -> list[float]:
     """Generates an embedding, indexed by title for each arxiv search result.

     :param title of the arxiv search result:
     :param embedding_model_name: the name of the embedding model to use.
     :return: the embedding as a vector of floats
     """
     response = openai.Embedding.create(input=title, model=embedding_model_name)
     return response["data"][0]["embedding"]


 def arxiv_pdf(arxiv_search_result: arxiv.Result, data_dir: str) -> str:
     """Processes the arxiv search results and downloads the PDFs.

     :param arxiv_search_result: search result object.
     :param data_dir: the directory to save the PDFs to.
     :return: The filepath to the pdf after downloading
     """
     if not os.path.exists(data_dir):
         os.path.makedirs(data_dir)
     return arxiv_search_result.download_pdf(dirpath=data_dir)


 def arxiv_processed_result(
     title: str,
     summary: str,
     article_url: str,
     pdf_url: str,
     arxiv_pdf: str,
     arxiv_result_embedding: list[float],
 ) -> Dict[str, str]:
     """creates dict with parameters as keys/values"""
     return {
         "title": title,
         "summary": summary,
         "article_url": article_url,
         "pdf_url": pdf_url,
         "arxiv_pdfs": arxiv_pdf,
         "arxiv_result_embeddings": arxiv_result_embedding,
     }


 def arxiv_result_df(arxiv_processed_result: Collect[Dict[str, str]]) -> pd.DataFrame:
     """Joins the arxiv results back to a dataframe.

     :param arxiv_processed_result: result of all the joined arxiv result information
     :return:  a dataframe with the arxiv results.
     """
     all_results = list(arxiv_processed_result)
     return pd.DataFrame.from_records(all_results).set_index("title", drop=False)


 def save_arxiv_result_df(arxiv_result_df: pd.DataFrame, library_file_path: str) -> dict:
     """Saves the arxiv result dataframe to a csv file.

     Appends if it already exists. Does not protect against duplicates.

     :param arxiv_result_df: the dataframe to save.
     :param library_file_path: the path to the library file.
     :return: a dictionary with the number of articles newly written.
     """
     if os.path.exists(library_file_path):
         arxiv_result_df.to_csv(library_file_path, mode="a", header=False, index=False)
     else:
         arxiv_result_df.to_csv(library_file_path, index=False)
     return {"num_articles_written": len(arxiv_result_df)}
	import os.path
	from typing import Dict

	import arxiv
	import openai
	import pandas as pd

	from hamilton.function_modifiers import extract_fields
	from hamilton.htypes import Collect, Parallelizable


	def arxiv_search_result(
	article_query: str,
	max_arxiv_results: int,
	sort_by: arxiv.SortCriterion.Relevance = arxiv.SortCriterion.Relevance,
	) -> Parallelizable[arxiv.Result]:
	"""Goes to arxiv and returns a list articles that match the provided query.

	:param article_query: the query to search for.
	:param max_arxiv_results: the maximum number of results to return.
	:param sort_by: sort the results by this criterion.
	:return: list of arxiv.Result objects.
	"""
	_search = arxiv.Search(
	query=article_query,
	max_results=max_arxiv_results,
	sort_by=sort_by,
	)
	for item in _search.results():
	yield item


	@extract_fields({"title": str, "summary": str, "article_url": str, "pdf_url": str})
	def result(arxiv_search_result: arxiv.Result) -> Dict[str, str]:
	return {
	"title": arxiv_search_result.title,
	"summary": arxiv_search_result.summary,
	"article_url": arxiv_search_result.links[0],
	"pdf_url": arxiv_search_result.links[1],
	}
	pass


	def arxiv_result_embedding(
	title: str,
	embedding_model_name: str,
	) -> list[float]:
	"""Generates an embedding, indexed by title for each arxiv search result.

	:param title of the arxiv search result:
	:param embedding_model_name: the name of the embedding model to use.
	:return: the embedding as a vector of floats
	"""
	response = openai.Embedding.create(input=title, model=embedding_model_name)
	return response["data"][0]["embedding"]


	def arxiv_pdf(arxiv_search_result: arxiv.Result, data_dir: str) -> str:
	"""Processes the arxiv search results and downloads the PDFs.

	:param arxiv_search_result: search result object.
	:param data_dir: the directory to save the PDFs to.
	:return: The filepath to the pdf after downloading
	"""
	if not os.path.exists(data_dir):
	os.path.makedirs(data_dir)
	return arxiv_search_result.download_pdf(dirpath=data_dir)


	def arxiv_processed_result(
	title: str,
	summary: str,
	article_url: str,
	pdf_url: str,
	arxiv_pdf: str,
	arxiv_result_embedding: list[float],
	) -> Dict[str, str]:
	"""creates dict with parameters as keys/values"""
	return {
	"title": title,
	"summary": summary,
	"article_url": article_url,
	"pdf_url": pdf_url,
	"arxiv_pdfs": arxiv_pdf,
	"arxiv_result_embeddings": arxiv_result_embedding,
	}


	def arxiv_result_df(arxiv_processed_result: Collect[Dict[str, str]]) -> pd.DataFrame:
	"""Joins the arxiv results back to a dataframe.

	:param arxiv_processed_result: result of all the joined arxiv result information
	:return: a dataframe with the arxiv results.
	"""
	all_results = list(arxiv_processed_result)
	return pd.DataFrame.from_records(all_results).set_index("title", drop=False)


	def save_arxiv_result_df(arxiv_result_df: pd.DataFrame, library_file_path: str) -> dict:
	"""Saves the arxiv result dataframe to a csv file.

	Appends if it already exists. Does not protect against duplicates.

	:param arxiv_result_df: the dataframe to save.
	:param library_file_path: the path to the library file.
	:return: a dictionary with the number of articles newly written.
	"""
	if os.path.exists(library_file_path):
	arxiv_result_df.to_csv(library_file_path, mode="a", header=False, index=False)
	else:
	arxiv_result_df.to_csv(library_file_path, index=False)
	return {"num_articles_written": len(arxiv_result_df)}