import os
import tempfile
from typing import Generator, Union
import tiktoken
from openai import OpenAI
from pypdf import PdfReader
from hamilton.htypes import Collect, Parallelizable
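
# Hamilton builds the summarization dataflow from this module: every top-level
# function below becomes a node, and a function's parameters are resolved by
# name against other nodes' outputs or the driver's inputs. For example,
# `chunked_text` consumes the outputs of `raw_text` and `tokenizer`.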


def openai_client() -> OpenAI:
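    """Instantiate an OpenAI client using the OPENAI_API_KEY environment variable"""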
return OpenAI(api_key=os.environ["OPENAI_API_KEY"])


def raw_text(pdf_source: Union[str, bytes, tempfile.SpooledTemporaryFile]) -> str:
    """Extract the text of a PDF as a single string.

    :param pdf_source: the path to the PDF, or a file-like object containing it.
    :return: the text of the PDF, with a page-number marker appended to each page.
    """
    reader = PdfReader(pdf_source)
    _pdf_text = ""
    for page_number, page in enumerate(reader.pages, start=1):
        _pdf_text += page.extract_text() + f"\nPage Number: {page_number}"
    return _pdf_text


def tokenizer(tokenizer_encoding: str = "cl100k_base") -> tiktoken.core.Encoding:
    """Get the OpenAI tokenizer for the given encoding"""
    return tiktoken.get_encoding(tokenizer_encoding)


def _create_chunks(
    text: str, tokenizer: tiktoken.core.Encoding, max_length: int
) -> Generator[list[int], None, None]:
    """Yield successive chunks of roughly `max_length` tokens from the text,
    preferring to end each chunk at a sentence boundary.
    """
tokens = tokenizer.encode(text)
i = 0
while i < len(tokens):
        # Find the nearest end of sentence within a range of
        # 0.5 * max_length and 1.5 * max_length tokens
j = min(i + int(1.5 * max_length), len(tokens))
while j > i + int(0.5 * max_length):
# Decode the tokens and check for full stop or newline
chunk = tokenizer.decode(tokens[i:j])
if chunk.endswith(".") or chunk.endswith("\n"):
break
j -= 1
        # If no end of sentence is found, fall back to max_length tokens as the chunk size
if j == i + int(0.5 * max_length):
j = min(i + max_length, len(tokens))
yield tokens[i:j]
i = j


def chunked_text(
    raw_text: str, tokenizer: tiktoken.core.Encoding, max_token_length: int = 800
) -> list[str]:
    """Tokenize the text, create chunks of roughly `max_token_length` tokens,
    and decode each chunk back into a string.
    """
_encoded_chunks = _create_chunks(raw_text, tokenizer, max_token_length)
_decoded_chunks = [tokenizer.decode(chunk) for chunk in _encoded_chunks]
return _decoded_chunks


def chunk_to_summarize(chunked_text: list[str]) -> Parallelizable[str]:
    """Yield each text chunk so it can be summarized in parallel"""
for chunk in chunked_text:
yield chunk


def _summarize_text__openai(openai_client: OpenAI, prompt: str, openai_gpt_model: str) -> str:
"""Use OpenAI chat API to ask a model to summarize content contained in a prompt"""
response = openai_client.chat.completions.create(
model=openai_gpt_model, messages=[{"role": "user", "content": prompt}], temperature=0
)
return response.choices[0].message.content


def prompt_to_summarize_chunk() -> str:
    """Base prompt to summarize a chunk of text"""
    return "Extract key points with reasoning into a bulleted format.\n\nContent:{content}"


def chunk_summary(
openai_client: OpenAI,
chunk_to_summarize: str,
prompt_to_summarize_chunk: str,
openai_gpt_model: str,
) -> str:
"""Fill a base prompt with a chunk's content and summarize it;
Store the summary in the chunk object
"""
filled_prompt = prompt_to_summarize_chunk.format(content=chunk_to_summarize)
return _summarize_text__openai(openai_client, filled_prompt, openai_gpt_model)


def prompt_to_reduce_summaries() -> str:
    """Prompt for a "reduce" operation to summarize a list of summaries into a single text"""
    return """Write a summary from this collection of key points.
First answer the question in two sentences. Then, highlight the core argument, conclusions and evidence.
User query: {query}
The summary should be structured in bulleted lists following the headings Answer, Core Argument, Evidence, and Conclusions.
Key points:\n{chunks_summary}\nSummary:\n"""


def chunk_summary_collection(chunk_summary: Collect[str]) -> list[str]:
    """Collect the per-chunk summaries computed in parallel"""
    return list(chunk_summary)


def final_summary(
openai_client: OpenAI,
query: str,
chunk_summary_collection: list[str],
prompt_to_reduce_summaries: str,
openai_gpt_model: str,
) -> str:
"""Concatenate the list of chunk summaries into a single text,fill the prompt template,
and use OpenAI to reduce the content into a single summary;
"""
concatenated_summaries = " ".join(chunk_summary_collection)
filled_prompt = prompt_to_reduce_summaries.format(
query=query, chunks_summary=concatenated_summaries
)
return _summarize_text__openai(openai_client, filled_prompt, openai_gpt_model)
if __name__ == "__main__":
import summarization
from hamilton import driver
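
    # `import summarization` assumes this module is saved as summarization.py;
    # Hamilton builds the DAG from the functions of the module it is given.
    # `Parallelizable`/`Collect` nodes require the dynamic executor, which is
    # why `enable_dynamic_execution(allow_experimental_mode=True)` is set below.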
dr = (
driver.Builder()
.enable_dynamic_execution(allow_experimental_mode=True)
.with_modules(summarization)
.build()
)
dr.display_all_functions("./docs/summary", {"view": False, "format": "png"}, orient="TB")
inputs = dict(
pdf_source="./data/hamilton_paper.pdf",
openai_gpt_model="gpt-3.5-turbo-0613",
query="What are the main benefits of this tool?",
)
results = dr.execute(["final_summary"], inputs=inputs)
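    # `execute` returns a dict keyed by the requested output variables.
    print(results["final_summary"])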