import os
import tempfile
from typing import Generator, Union
import tiktoken
from openai import OpenAI
from pypdf import PdfReader
from hamilton.htypes import Collect, Parallelizable
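
# Hamilton builds the summarization dataflow from this module: every top-level
# function below becomes a node, and a function's parameters are resolved by
# name against other nodes' outputs or the driver's inputs. For example,
# `chunked_text` consumes the outputs of `raw_text` and `tokenizer`.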


def openai_client() -> OpenAI:
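    """Instantiate an OpenAI client using the OPENAI_API_KEY environment variable"""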
return OpenAI(api_key=os.environ["OPENAI_API_KEY"])


def raw_text(pdf_source: Union[str, bytes, tempfile.SpooledTemporaryFile]) -> str:
    """Extract the text of a PDF as a single string.

    :param pdf_source: the path to the PDF, or a file-like object containing it.
    :return: the text of the PDF, with a page-number marker appended to each page.
    """
    reader = PdfReader(pdf_source)
    _pdf_text = ""
    for page_number, page in enumerate(reader.pages, start=1):
        _pdf_text += page.extract_text() + f"\nPage Number: {page_number}"
    return _pdf_text


def tokenizer(tokenizer_encoding: str = "cl100k_base") -> tiktoken.core.Encoding:
    """Get the OpenAI tokenizer for the given encoding"""
    return tiktoken.get_encoding(tokenizer_encoding)


def _create_chunks(
    text: str, tokenizer: tiktoken.core.Encoding, max_length: int
) -> Generator[list[int], None, None]:
    """Yield successive chunks of roughly `max_length` tokens from the text,
    preferring to end each chunk at a sentence boundary.
    """
tokens = tokenizer.encode(text)
i = 0
while i < len(tokens):
        # Find the nearest end of sentence within a range of
        # 0.5 * max_length and 1.5 * max_length tokens
j = min(i + int(1.5 * max_length), len(tokens))
while j > i + int(0.5 * max_length):
# Decode the tokens and check for full stop or newline
chunk = tokenizer.decode(tokens[i:j])
if chunk.endswith(".") or chunk.endswith("\n"):
break
j -= 1
        # If no end of sentence is found, fall back to max_length tokens as the chunk size
if j == i + int(0.5 * max_length):
j = min(i + max_length, len(tokens))
yield tokens[i:j]
i = j


def chunked_text(
    raw_text: str, tokenizer: tiktoken.core.Encoding, max_token_length: int = 800
) -> list[str]:
    """Tokenize the text, create chunks of roughly `max_token_length` tokens,
    and decode each chunk back into a string.
    """
_encoded_chunks = _create_chunks(raw_text, tokenizer, max_token_length)
_decoded_chunks = [tokenizer.decode(chunk) for chunk in _encoded_chunks]
return _decoded_chunks


def chunk_to_summarize(chunked_text: list[str]) -> Parallelizable[str]:
    """Yield each text chunk so it can be summarized in parallel"""
for chunk in chunked_text:
yield chunk


def _summarize_text__openai(openai_client: OpenAI, prompt: str, openai_gpt_model: str) -> str:
"""Use OpenAI chat API to ask a model to summarize content contained in a prompt"""
response = openai_client.chat.completions.create(
model=openai_gpt_model, messages=[{"role": "user", "content": prompt}], temperature=0
)
return response.choices[0].message.content


def prompt_to_summarize_chunk() -> str:
    """Base prompt to summarize a chunk of text"""
    return "Extract key points with reasoning into a bulleted format.\n\nContent:{content}"


def chunk_summary(
openai_client: OpenAI,
chunk_to_summarize: str,
prompt_to_summarize_chunk: str,
openai_gpt_model: str,
) -> str:
"""Fill a base prompt with a chunk's content and summarize it;
Store the summary in the chunk object
"""
filled_prompt = prompt_to_summarize_chunk.format(content=chunk_to_summarize)
return _summarize_text__openai(openai_client, filled_prompt, openai_gpt_model)


def prompt_to_reduce_summaries() -> str:
    """Prompt for a "reduce" operation to summarize a list of summaries into a single text"""
    return """Write a summary from this collection of key points.
First answer the question in two sentences. Then, highlight the core argument, conclusions and evidence.
User query: {query}
The summary should be structured in bulleted lists following the headings Answer, Core Argument, Evidence, and Conclusions.
Key points:\n{chunks_summary}\nSummary:\n"""


def chunk_summary_collection(chunk_summary: Collect[str]) -> list[str]:
    """Collect the per-chunk summaries computed in parallel"""
    return list(chunk_summary)


def final_summary(
openai_client: OpenAI,
query: str,
chunk_summary_collection: list[str],
prompt_to_reduce_summaries: str,
openai_gpt_model: str,
) -> str:
"""Concatenate the list of chunk summaries into a single text,fill the prompt template,
and use OpenAI to reduce the content into a single summary;
"""
concatenated_summaries = " ".join(chunk_summary_collection)
filled_prompt = prompt_to_reduce_summaries.format(
query=query, chunks_summary=concatenated_summaries
)
return _summarize_text__openai(openai_client, filled_prompt, openai_gpt_model)
if __name__ == "__main__":
import summarization
from hamilton import driver
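
    # `import summarization` assumes this module is saved as summarization.py;
    # Hamilton builds the DAG from the functions of the module it is given.
    # `Parallelizable`/`Collect` nodes require the dynamic executor, which is
    # why `enable_dynamic_execution(allow_experimental_mode=True)` is set below.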
dr = (
driver.Builder()
.enable_dynamic_execution(allow_experimental_mode=True)
.with_modules(summarization)
.build()
)
dr.display_all_functions("./docs/summary", {"view": False, "format": "png"}, orient="TB")
inputs = dict(
pdf_source="./data/hamilton_paper.pdf",
openai_gpt_model="gpt-3.5-turbo-0613",
query="What are the main benefits of this tool?",
)
results = dr.execute(["final_summary"], inputs=inputs)
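    # `execute` returns a dict keyed by the requested output variables.
    print(results["final_summary"])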