diff options
author | Reinier van der Leer <pwuts@agpt.co> | 2023-12-12 17:41:04 +0100 |
---|---|---|
committer | Reinier van der Leer <pwuts@agpt.co> | 2023-12-12 17:41:55 +0100 |
commit | d95e3b5b54bcac00541bbac1e69644c73970b72f (patch) | |
tree | f5df5cd23a06761af0706b03ab1b302e7b942547 | |
parent | fix(agent/file_operations): Fix path processing in file_operations.py and acr... (diff) | |
download | Auto-GPT-d95e3b5b54bcac00541bbac1e69644c73970b72f.tar.gz Auto-GPT-d95e3b5b54bcac00541bbac1e69644c73970b72f.tar.bz2 Auto-GPT-d95e3b5b54bcac00541bbac1e69644c73970b72f.zip |
refactor(agent/file_operations): Refactor file opening/reading and parsing
- Update the signature of `FileWorkspace.open_file` and fix implementations in every workspace backend
- Replace `open()` with `workspace.open_file` in the `read_file` command to use the workspace's file opening functionality
- Fix the parametrization of the `test_text_file_parsers` test to correctly test text file parsers
-rw-r--r-- | autogpts/autogpt/autogpt/commands/file_operations.py | 9 | ||||
-rw-r--r-- | autogpts/autogpt/autogpt/commands/file_operations_utils.py | 105 | ||||
-rw-r--r-- | autogpts/autogpt/autogpt/file_workspace/base.py | 21 | ||||
-rw-r--r-- | autogpts/autogpt/autogpt/file_workspace/gcs.py | 33 | ||||
-rw-r--r-- | autogpts/autogpt/autogpt/file_workspace/local.py | 20 | ||||
-rw-r--r-- | autogpts/autogpt/autogpt/file_workspace/s3.py | 16 | ||||
-rw-r--r-- | autogpts/autogpt/autogpt/models/context_item.py | 5 | ||||
-rw-r--r-- | autogpts/autogpt/tests/unit/test_text_file_parsers.py | 25 |
8 files changed, 124 insertions, 110 deletions
diff --git a/autogpts/autogpt/autogpt/commands/file_operations.py b/autogpts/autogpt/autogpt/commands/file_operations.py index 6f4986105..9d70b3874 100644 --- a/autogpts/autogpt/autogpt/commands/file_operations.py +++ b/autogpts/autogpt/autogpt/commands/file_operations.py @@ -16,7 +16,7 @@ from autogpt.core.utils.json_schema import JSONSchema from autogpt.memory.vector import MemoryItemFactory, VectorMemory from .decorators import sanitize_path_arg -from .file_operations_utils import read_textual_file +from .file_operations_utils import decode_textual_file COMMAND_CATEGORY = "file_operations" COMMAND_CATEGORY_TITLE = "File Operations" @@ -140,8 +140,7 @@ def log_operation( ) }, ) -@sanitize_path_arg("filename") -def read_file(filename: Path, agent: Agent) -> str: +def read_file(filename: str | Path, agent: Agent) -> str: """Read a file and return the contents Args: @@ -150,8 +149,8 @@ def read_file(filename: Path, agent: Agent) -> str: Returns: str: The contents of the file """ - content = read_textual_file(filename, logger) - # TODO: content = agent.workspace.read_file(filename) + file = agent.workspace.open_file(filename, binary=True) + content = decode_textual_file(file, logger) # # TODO: invalidate/update memory when file is edited # file_memory = MemoryItem.from_text_file(content, str(filename), agent.config) diff --git a/autogpts/autogpt/autogpt/commands/file_operations_utils.py b/autogpts/autogpt/autogpt/commands/file_operations_utils.py index 6b24a6f11..e7c001e93 100644 --- a/autogpts/autogpt/autogpt/commands/file_operations_utils.py +++ b/autogpts/autogpt/autogpt/commands/file_operations_utils.py @@ -1,11 +1,11 @@ import json import logging import os -from pathlib import Path +from abc import ABC, abstractmethod +from typing import BinaryIO import charset_normalizer import docx -import markdown import pypdf import yaml from bs4 import BeautifulSoup @@ -14,23 +14,24 @@ from pylatexenc.latex2text import LatexNodes2Text logger = logging.getLogger(__name__) -class ParserStrategy: - def read(self, file_path: Path) -> str: - raise NotImplementedError +class ParserStrategy(ABC): + @abstractmethod + def read(self, file: BinaryIO) -> str: + ... # Basic text file reading class TXTParser(ParserStrategy): - def read(self, file_path: Path) -> str: - charset_match = charset_normalizer.from_path(file_path).best() - logger.debug(f"Reading '{file_path}' with encoding '{charset_match.encoding}'") + def read(self, file: BinaryIO) -> str: + charset_match = charset_normalizer.from_bytes(file.read()).best() + logger.debug(f"Reading '{file.name}' with encoding '{charset_match.encoding}'") return str(charset_match) # Reading text from binary file using pdf parser class PDFParser(ParserStrategy): - def read(self, file_path: Path) -> str: - parser = pypdf.PdfReader(file_path) + def read(self, file: BinaryIO) -> str: + parser = pypdf.PdfReader(file) text = "" for page_idx in range(len(parser.pages)): text += parser.pages[page_idx].extract_text() @@ -39,8 +40,8 @@ class PDFParser(ParserStrategy): # Reading text from binary file using docs parser class DOCXParser(ParserStrategy): - def read(self, file_path: Path) -> str: - doc_file = docx.Document(file_path) + def read(self, file: BinaryIO) -> str: + doc_file = docx.Document(file) text = "" for para in doc_file.paragraphs: text += para.text @@ -49,50 +50,37 @@ class DOCXParser(ParserStrategy): # Reading as dictionary and returning string format class JSONParser(ParserStrategy): - def read(self, file_path: Path) -> str: - with open(file_path, "r") as f: - data = json.load(f) - text = str(data) + def read(self, file: BinaryIO) -> str: + data = json.load(file) + text = str(data) return text class XMLParser(ParserStrategy): - def read(self, file_path: Path) -> str: - with open(file_path, "r") as f: - soup = BeautifulSoup(f, "xml") - text = soup.get_text() + def read(self, file: BinaryIO) -> str: + soup = BeautifulSoup(file, "xml") + text = soup.get_text() return text # Reading as dictionary and returning string format class YAMLParser(ParserStrategy): - def read(self, file_path: Path) -> str: - with open(file_path, "r") as f: - data = yaml.load(f, Loader=yaml.FullLoader) - text = str(data) + def read(self, file: BinaryIO) -> str: + data = yaml.load(file, Loader=yaml.FullLoader) + text = str(data) return text class HTMLParser(ParserStrategy): - def read(self, file_path: Path) -> str: - with open(file_path, "r") as f: - soup = BeautifulSoup(f, "html.parser") - text = soup.get_text() - return text - - -class MarkdownParser(ParserStrategy): - def read(self, file_path: Path) -> str: - with open(file_path, "r") as f: - html = markdown.markdown(f.read()) - text = "".join(BeautifulSoup(html, "html.parser").findAll(string=True)) + def read(self, file: BinaryIO) -> str: + soup = BeautifulSoup(file, "html.parser") + text = soup.get_text() return text class LaTeXParser(ParserStrategy): - def read(self, file_path: Path) -> str: - with open(file_path, "r") as f: - latex = f.read() + def read(self, file: BinaryIO) -> str: + latex = file.read().decode() text = LatexNodes2Text().latex_to_text(latex) return text @@ -106,13 +94,15 @@ class FileContext: self.logger.debug(f"Setting Context Parser to {parser}") self.parser = parser - def read_file(self, file_path) -> str: - self.logger.debug(f"Reading file {file_path} with parser {self.parser}") - return self.parser.read(file_path) + def decode_file(self, file: BinaryIO) -> str: + self.logger.debug(f"Reading file {file.name} with parser {self.parser}") + return self.parser.read(file) extension_to_parser = { ".txt": TXTParser(), + ".md": TXTParser(), + ".markdown": TXTParser(), ".csv": TXTParser(), ".pdf": PDFParser(), ".docx": DOCXParser(), @@ -123,47 +113,36 @@ extension_to_parser = { ".html": HTMLParser(), ".htm": HTMLParser(), ".xhtml": HTMLParser(), - ".md": MarkdownParser(), - ".markdown": MarkdownParser(), ".tex": LaTeXParser(), } -def is_file_binary_fn(file_path: Path): +def is_file_binary_fn(file: BinaryIO): """Given a file path load all its content and checks if the null bytes is present Args: - file_path (_type_): _description_ + file (_type_): _description_ Returns: bool: is_binary """ - with open(file_path, "rb") as f: - file_data = f.read() + file_data = file.read() + file.seek(0) if b"\x00" in file_data: return True return False -def read_textual_file(file_path: Path, logger: logging.Logger) -> str: - if not file_path.is_absolute(): - raise ValueError("File path must be absolute") - - if not file_path.is_file(): - if not file_path.exists(): - raise FileNotFoundError( - f"read_file {file_path} failed: no such file or directory" - ) - else: - raise ValueError(f"read_file failed: {file_path} is not a file") +def decode_textual_file(file: BinaryIO, logger: logging.Logger) -> str: + if not file.readable(): + raise ValueError(f"read_file failed: {file.name} is not a file") - is_binary = is_file_binary_fn(file_path) - file_extension = os.path.splitext(file_path)[1].lower() + file_extension = os.path.splitext(file.name)[1].lower() parser = extension_to_parser.get(file_extension) if not parser: - if is_binary: + if is_file_binary_fn(file): raise ValueError(f"Unsupported binary file format: {file_extension}") # fallback to txt file parser (to support script and code files loading) parser = TXTParser() file_context = FileContext(parser, logger) - return file_context.read_file(file_path) + return file_context.decode_file(file) diff --git a/autogpts/autogpt/autogpt/file_workspace/base.py b/autogpts/autogpt/autogpt/file_workspace/base.py index 2e4f43475..865b34dd5 100644 --- a/autogpts/autogpt/autogpt/file_workspace/base.py +++ b/autogpts/autogpt/autogpt/file_workspace/base.py @@ -5,8 +5,9 @@ from __future__ import annotations import logging from abc import ABC, abstractmethod +from io import IOBase, TextIOBase from pathlib import Path -from typing import Any, Callable, Literal, Optional, overload +from typing import IO, Any, BinaryIO, Callable, Literal, Optional, TextIO, overload from autogpt.core.configuration.schema import SystemConfiguration @@ -47,9 +48,23 @@ class FileWorkspace(ABC): doesn't exist yet. E.g. a folder on disk, or an S3 Bucket. """ + @overload + @abstractmethod + def open_file( + self, path: str | Path, binary: Literal[False] = False + ) -> TextIO | TextIOBase: + """Returns a readable text file-like object representing the file.""" + + @overload + @abstractmethod + def open_file( + self, path: str | Path, binary: Literal[True] = True + ) -> BinaryIO | IOBase: + """Returns a readable binary file-like object representing the file.""" + @abstractmethod - def open_file(self, path: str | Path, mode: str = "r"): - """Open a file in the workspace.""" + def open_file(self, path: str | Path, binary: bool = False) -> IO | IOBase: + """Returns a readable file-like object representing the file.""" @overload @abstractmethod diff --git a/autogpts/autogpt/autogpt/file_workspace/gcs.py b/autogpts/autogpt/autogpt/file_workspace/gcs.py index bca410451..3a6b9772c 100644 --- a/autogpts/autogpt/autogpt/file_workspace/gcs.py +++ b/autogpts/autogpt/autogpt/file_workspace/gcs.py @@ -6,6 +6,7 @@ from __future__ import annotations import inspect import logging +from io import IOBase from pathlib import Path from google.cloud import storage @@ -40,7 +41,7 @@ class GCSFileWorkspace(FileWorkspace): return self._root @property - def restrict_to_root(self): + def restrict_to_root(self) -> bool: """Whether to restrict generated paths to the root.""" return True @@ -50,26 +51,28 @@ class GCSFileWorkspace(FileWorkspace): def get_path(self, relative_path: str | Path) -> Path: return super().get_path(relative_path).relative_to("/") - def open_file(self, path: str | Path, mode: str = "r"): - """Open a file in the workspace.""" + def _get_blob(self, path: str | Path) -> storage.Blob: path = self.get_path(path) - blob = self._bucket.blob(str(path)) - return blob + return self._bucket.blob(str(path)) + + def open_file(self, path: str | Path, binary: bool = False) -> IOBase: + """Open a file in the workspace.""" + blob = self._get_blob(path) + blob.reload() # pin revision number to prevent version mixing while reading + return blob.open("rb" if binary else "r") def read_file(self, path: str | Path, binary: bool = False) -> str | bytes: """Read a file in the workspace.""" - blob = self.open_file(path, "r") - file_content = ( - blob.download_as_text() if not binary else blob.download_as_bytes() - ) - return file_content + return self.open_file(path, binary).read() - async def write_file(self, path: str | Path, content: str | bytes): + async def write_file(self, path: str | Path, content: str | bytes) -> None: """Write to a file in the workspace.""" - blob = self.open_file(path, "w") - blob.upload_from_string(content) if isinstance( - content, str - ) else blob.upload_from_file(content) + blob = self._get_blob(path) + + if isinstance(content, str): + blob.upload_from_string(content) + else: + blob.upload_from_file(content) if self.on_write_file: path = Path(path) diff --git a/autogpts/autogpt/autogpt/file_workspace/local.py b/autogpts/autogpt/autogpt/file_workspace/local.py index 347e29667..8c2aa6521 100644 --- a/autogpts/autogpt/autogpt/file_workspace/local.py +++ b/autogpts/autogpt/autogpt/file_workspace/local.py @@ -6,6 +6,7 @@ from __future__ import annotations import inspect import logging from pathlib import Path +from typing import IO from .base import FileWorkspace, FileWorkspaceConfiguration @@ -26,26 +27,29 @@ class LocalFileWorkspace(FileWorkspace): return self._root @property - def restrict_to_root(self): + def restrict_to_root(self) -> bool: """Whether to restrict generated paths to the root.""" return self._restrict_to_root def initialize(self) -> None: self.root.mkdir(exist_ok=True, parents=True) - def open_file(self, path: str | Path, mode: str = "r"): + def open_file(self, path: str | Path, binary: bool = False) -> IO: """Open a file in the workspace.""" + return self._open_file(path, "rb" if binary else "r") + + def _open_file(self, path: str | Path, mode: str = "r") -> IO: full_path = self.get_path(path) - return open(full_path, mode) + return open(full_path, mode) # type: ignore - def read_file(self, path: str | Path, binary: bool = False): + def read_file(self, path: str | Path, binary: bool = False) -> str | bytes: """Read a file in the workspace.""" - with self.open_file(path, "rb" if binary else "r") as file: + with self._open_file(path, "rb" if binary else "r") as file: return file.read() - async def write_file(self, path: str | Path, content: str | bytes): + async def write_file(self, path: str | Path, content: str | bytes) -> None: """Write to a file in the workspace.""" - with self.open_file(path, "wb" if type(content) is bytes else "w") as file: + with self._open_file(path, "wb" if type(content) is bytes else "w") as file: file.write(content) if self.on_write_file: @@ -61,7 +65,7 @@ class LocalFileWorkspace(FileWorkspace): path = self.get_path(path) return [file.relative_to(path) for file in path.rglob("*") if file.is_file()] - def delete_file(self, path: str | Path): + def delete_file(self, path: str | Path) -> None: """Delete a file in the workspace.""" full_path = self.get_path(path) full_path.unlink() diff --git a/autogpts/autogpt/autogpt/file_workspace/s3.py b/autogpts/autogpt/autogpt/file_workspace/s3.py index d52d9388f..79f3ae74e 100644 --- a/autogpts/autogpt/autogpt/file_workspace/s3.py +++ b/autogpts/autogpt/autogpt/file_workspace/s3.py @@ -8,6 +8,7 @@ import contextlib import inspect import logging import os +from io import IOBase, TextIOWrapper from pathlib import Path from typing import TYPE_CHECKING, Optional @@ -74,22 +75,27 @@ class S3FileWorkspace(FileWorkspace): def get_path(self, relative_path: str | Path) -> Path: return super().get_path(relative_path).relative_to("/") - def open_file(self, path: str | Path, mode: str = "r"): - """Open a file in the workspace.""" + def _get_obj(self, path: str | Path) -> mypy_boto3_s3.service_resource.Object: + """Get an S3 object.""" path = self.get_path(path) obj = self._bucket.Object(str(path)) with contextlib.suppress(botocore.exceptions.ClientError): obj.load() return obj + def open_file(self, path: str | Path, binary: bool = False) -> IOBase: + """Open a file in the workspace.""" + obj = self._get_obj(path) + return obj.get()["Body"] if not binary else TextIOWrapper(obj.get()["Body"]) + def read_file(self, path: str | Path, binary: bool = False) -> str | bytes: """Read a file in the workspace.""" - file_content = self.open_file(path, "r").get()["Body"].read() + file_content = self.open_file(path, binary).read() return file_content if binary else file_content.decode() - async def write_file(self, path: str | Path, content: str | bytes): + async def write_file(self, path: str | Path, content: str | bytes) -> None: """Write to a file in the workspace.""" - obj = self.open_file(path, "w") + obj = self._get_obj(path) obj.put(Body=content) if self.on_write_file: diff --git a/autogpts/autogpt/autogpt/models/context_item.py b/autogpts/autogpt/autogpt/models/context_item.py index 17a2a0b8e..0e5d9a373 100644 --- a/autogpts/autogpt/autogpt/models/context_item.py +++ b/autogpts/autogpt/autogpt/models/context_item.py @@ -5,7 +5,7 @@ from typing import Optional from pydantic import BaseModel, Field -from autogpt.commands.file_operations_utils import read_textual_file +from autogpt.commands.file_operations_utils import decode_textual_file logger = logging.getLogger(__name__) @@ -56,7 +56,8 @@ class FileContextItem(BaseModel, ContextItem): @property def content(self) -> str: - return read_textual_file(self.file_path, logger) + with open(self.file_path, "rb") as file: + return decode_textual_file(file, logger) class FolderContextItem(BaseModel, ContextItem): diff --git a/autogpts/autogpt/tests/unit/test_text_file_parsers.py b/autogpts/autogpt/tests/unit/test_text_file_parsers.py index 3b1231489..d17de9c8c 100644 --- a/autogpts/autogpt/tests/unit/test_text_file_parsers.py +++ b/autogpts/autogpt/tests/unit/test_text_file_parsers.py @@ -5,10 +5,14 @@ from pathlib import Path from xml.etree import ElementTree import docx +import pytest import yaml from bs4 import BeautifulSoup -from autogpt.commands.file_operations_utils import is_file_binary_fn, read_textual_file +from autogpt.commands.file_operations_utils import ( + decode_textual_file, + is_file_binary_fn, +) logger = logging.getLogger(__name__) @@ -148,15 +152,18 @@ respective_file_creation_functions = { binary_files_extensions = [".pdf", ".docx"] -def test_parsers(): - for ( - file_extension, - c_file_creator, - ) in respective_file_creation_functions.items(): - created_file_path = Path(c_file_creator()) - loaded_text = read_textual_file(created_file_path, logger) +@pytest.mark.parametrize( + "file_extension, c_file_creator", + respective_file_creation_functions.items(), +) +def test_parsers(file_extension, c_file_creator): + created_file_path = Path(c_file_creator()) + with open(created_file_path, "rb") as file: + loaded_text = decode_textual_file(file, logger) assert plain_text_str in loaded_text should_be_binary = file_extension in binary_files_extensions - assert should_be_binary == is_file_binary_fn(created_file_path) + assert should_be_binary == is_file_binary_fn(file) + + created_file_path.unlink() # cleanup |