Diffstat (limited to 'autogpt/processing/text.py')
-rw-r--r-- | autogpt/processing/text.py | 245 |
1 files changed, 0 insertions, 245 deletions
diff --git a/autogpt/processing/text.py b/autogpt/processing/text.py
deleted file mode 100644
index ddb64df18..000000000
--- a/autogpt/processing/text.py
+++ /dev/null
@@ -1,245 +0,0 @@
-"""Text processing functions"""
-from math import ceil
-from typing import Optional
-
-import spacy
-import tiktoken
-
-from autogpt.config import Config
-from autogpt.llm.base import ChatSequence
-from autogpt.llm.providers.openai import OPEN_AI_MODELS
-from autogpt.llm.utils import count_string_tokens, create_chat_completion
-from autogpt.logs import logger
-from autogpt.utils import batch
-
-
-def _max_chunk_length(model: str, max: Optional[int] = None) -> int:
-    model_max_input_tokens = OPEN_AI_MODELS[model].max_tokens - 1
-    if max is not None and max > 0:
-        return min(max, model_max_input_tokens)
-    return model_max_input_tokens
-
-
-def must_chunk_content(
-    text: str, for_model: str, max_chunk_length: Optional[int] = None
-) -> bool:
-    return count_string_tokens(text, for_model) > _max_chunk_length(
-        for_model, max_chunk_length
-    )
-
-
-def chunk_content(
-    content: str,
-    for_model: str,
-    max_chunk_length: Optional[int] = None,
-    with_overlap=True,
-):
-    """Split content into chunks of approximately equal token length."""
-
-    MAX_OVERLAP = 200  # limit overlap to save tokens
-
-    if not must_chunk_content(content, for_model, max_chunk_length):
-        yield content, count_string_tokens(content, for_model)
-        return
-
-    max_chunk_length = max_chunk_length or _max_chunk_length(for_model)
-
-    tokenizer = tiktoken.encoding_for_model(for_model)
-
-    tokenized_text = tokenizer.encode(content)
-    total_length = len(tokenized_text)
-    n_chunks = ceil(total_length / max_chunk_length)
-
-    chunk_length = ceil(total_length / n_chunks)
-    overlap = min(max_chunk_length - chunk_length, MAX_OVERLAP) if with_overlap else 0
-
-    for token_batch in batch(tokenized_text, chunk_length + overlap, overlap):
-        yield tokenizer.decode(token_batch), len(token_batch)
-
-
-def summarize_text(
-    text: str,
-    config: Config,
-    instruction: Optional[str] = None,
-    question: Optional[str] = None,
-) -> tuple[str, None | list[tuple[str, str]]]:
-    """Summarize text using the OpenAI API
-
-    Args:
-        text (str): The text to summarize
-        config (Config): The config object
-        instruction (str): Additional instruction for summarization, e.g. "focus on information related to polar bears", "omit personal information contained in the text"
-        question (str): Question to answer in the summary
-
-    Returns:
-        str: The summary of the text
-        list[(summary, chunk)]: Text chunks and their summary, if the text was chunked.
-            None otherwise.
-    """
-    if not text:
-        raise ValueError("No text to summarize")
-
-    if instruction and question:
-        raise ValueError("Parameters 'question' and 'instructions' cannot both be set")
-
-    model = config.fast_llm
-
-    if question:
-        instruction = (
-            f'include any information that can be used to answer the question "{question}". '
-            "Do not directly answer the question itself"
-        )
-
-    summarization_prompt = ChatSequence.for_model(model)
-
-    token_length = count_string_tokens(text, model)
-    logger.info(f"Text length: {token_length} tokens")
-
-    # reserve 50 tokens for summary prompt, 500 for the response
-    max_chunk_length = _max_chunk_length(model) - 550
-    logger.info(f"Max chunk length: {max_chunk_length} tokens")
-
-    if not must_chunk_content(text, model, max_chunk_length):
-        # summarization_prompt.add("user", text)
-        summarization_prompt.add(
-            "user",
-            "Write a concise summary of the following text"
-            f"{f'; {instruction}' if instruction is not None else ''}:"
-            "\n\n\n"
-            f'LITERAL TEXT: """{text}"""'
-            "\n\n\n"
-            "CONCISE SUMMARY: The text is best summarized as"
-            # "Only respond with a concise summary or description of the user message."
-        )
-
-        logger.debug(f"Summarizing with {model}:\n{summarization_prompt.dump()}\n")
-        summary = create_chat_completion(
-            prompt=summarization_prompt, config=config, temperature=0, max_tokens=500
-        ).content
-
-        logger.debug(f"\n{'-'*16} SUMMARY {'-'*17}\n{summary}\n{'-'*42}\n")
-        return summary.strip(), None
-
-    summaries: list[str] = []
-    chunks = list(
-        split_text(
-            text, for_model=model, config=config, max_chunk_length=max_chunk_length
-        )
-    )
-
-    for i, (chunk, chunk_length) in enumerate(chunks):
-        logger.info(
-            f"Summarizing chunk {i + 1} / {len(chunks)} of length {chunk_length} tokens"
-        )
-        summary, _ = summarize_text(chunk, config, instruction)
-        summaries.append(summary)
-
-    logger.info(f"Summarized {len(chunks)} chunks")
-
-    summary, _ = summarize_text("\n\n".join(summaries), config)
-
-    return summary.strip(), [
-        (summaries[i], chunks[i][0]) for i in range(0, len(chunks))
-    ]
-
-
-def split_text(
-    text: str,
-    for_model: str,
-    config: Config,
-    with_overlap=True,
-    max_chunk_length: Optional[int] = None,
-):
-    """Split text into chunks of sentences, with each chunk not exceeding the maximum length
-
-    Args:
-        text (str): The text to split
-        for_model (str): The model to chunk for; determines tokenizer and constraints
-        config (Config): The config object
-        with_overlap (bool, optional): Whether to allow overlap between chunks
-        max_chunk_length (int, optional): The maximum length of a chunk
-
-    Yields:
-        str: The next chunk of text
-
-    Raises:
-        ValueError: when a sentence is longer than the maximum length
-    """
-
-    max_length = _max_chunk_length(for_model, max_chunk_length)
-
-    # flatten paragraphs to improve performance
-    text = text.replace("\n", " ")
-    text_length = count_string_tokens(text, for_model)
-
-    if text_length < max_length:
-        yield text, text_length
-        return
-
-    n_chunks = ceil(text_length / max_length)
-    target_chunk_length = ceil(text_length / n_chunks)
-
-    nlp: spacy.language.Language = spacy.load(config.browse_spacy_language_model)
-    nlp.add_pipe("sentencizer")
-    doc = nlp(text)
-    sentences = [sentence.text.strip() for sentence in doc.sents]
-
-    current_chunk: list[str] = []
-    current_chunk_length = 0
-    last_sentence = None
-    last_sentence_length = 0
-
-    i = 0
-    while i < len(sentences):
-        sentence = sentences[i]
-        sentence_length = count_string_tokens(sentence, for_model)
-        expected_chunk_length = current_chunk_length + 1 + sentence_length
-
-        if (
-            expected_chunk_length < max_length
-            # try to create chunks of approximately equal size
-            and expected_chunk_length - (sentence_length / 2) < target_chunk_length
-        ):
-            current_chunk.append(sentence)
-            current_chunk_length = expected_chunk_length
-
-        elif sentence_length < max_length:
-            if last_sentence:
-                yield " ".join(current_chunk), current_chunk_length
-                current_chunk = []
-                current_chunk_length = 0
-
-                if with_overlap:
-                    overlap_max_length = max_length - sentence_length - 1
-                    if last_sentence_length < overlap_max_length:
-                        current_chunk += [last_sentence]
-                        current_chunk_length += last_sentence_length + 1
-                    elif overlap_max_length > 5:
-                        # add as much from the end of the last sentence as fits
-                        current_chunk += [
-                            list(
-                                chunk_content(
-                                    last_sentence,
-                                    for_model,
-                                    overlap_max_length,
-                                )
-                            ).pop()[0],
-                        ]
-                        current_chunk_length += overlap_max_length + 1
-
-            current_chunk += [sentence]
-            current_chunk_length += sentence_length
-
-        else:  # sentence longer than maximum length -> chop up and try again
-            sentences[i : i + 1] = [
-                chunk
-                for chunk, _ in chunk_content(sentence, for_model, target_chunk_length)
-            ]
-            continue
-
-        i += 1
-        last_sentence = sentence
-        last_sentence_length = sentence_length
-
-    if current_chunk:
-        yield " ".join(current_chunk), current_chunk_length
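
As a worked example of the chunk-sizing arithmetic in the deleted chunk_content above (the numbers are illustrative and not taken from the repository; it assumes batch() steps back by its third argument between consecutive batches):

from math import ceil

total_length = 10_000       # tokens in the input
max_chunk_length = 4_000    # per-chunk budget for the model
n_chunks = ceil(total_length / max_chunk_length)     # 3
chunk_length = ceil(total_length / n_chunks)         # 3334
overlap = min(max_chunk_length - chunk_length, 200)  # 200 (capped by MAX_OVERLAP)
# each yielded batch spans chunk_length + overlap = 3534 tokens,
# with roughly a 200-token overlap carried into the next batch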
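
And a minimal usage sketch of the module's two public entry points before their removal. This is an assumption-laden illustration, not code from the diff: the driver function, its arguments, and the example instruction string are hypothetical, and the Config instance must already be built (with fast_llm and browse_spacy_language_model set), since construction varies between Auto-GPT versions.

from autogpt.config import Config
from autogpt.processing.text import split_text, summarize_text


def demo(config: Config, long_text: str) -> None:
    """Hypothetical driver; assumes `config` is an already-built Auto-GPT Config."""
    # split_text yields (chunk, token_count) pairs sized to the model's context window
    for chunk, n_tokens in split_text(long_text, for_model=config.fast_llm, config=config):
        print(f"chunk of {n_tokens} tokens")

    # summarize_text returns (summary, per-chunk results, or None if no chunking was needed)
    summary, per_chunk = summarize_text(long_text, config, instruction="focus on key findings")
    print(summary)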