refactor(agent/file_operations): Refactor file opening/reading and parsing

- Update the signature of `FileWorkspace.open_file` and fix implementations in every workspace backend - Replace `open()` with `workspace.open_file` in the `read_file` command to use the workspace's file opening functionality - Fix the parametrization of the `test_text_file_parsers` test to correctly test text file parsers
author: Reinier van der Leer <pwuts@agpt.co> 2023-12-12 17:41:04 +0100
committer: Reinier van der Leer <pwuts@agpt.co> 2023-12-12 17:41:55 +0100
commit: d95e3b5b54bcac00541bbac1e69644c73970b72f (patch)
tree: f5df5cd23a06761af0706b03ab1b302e7b942547
parent: fix(agent/file_operations): Fix path processing in file_operations.py and acr... (diff)
download: Auto-GPT-d95e3b5b54bcac00541bbac1e69644c73970b72f.tar.gz
Auto-GPT-d95e3b5b54bcac00541bbac1e69644c73970b72f.tar.bz2
Auto-GPT-d95e3b5b54bcac00541bbac1e69644c73970b72f.zip
8 files changed, 124 insertions, 110 deletions
diff --git a/autogpts/autogpt/autogpt/commands/file_operations.py b/autogpts/autogpt/autogpt/commands/file_operations.py
index 6f4986105..9d70b3874 100644
--- a/autogpts/autogpt/autogpt/commands/file_operations.py
+++ b/autogpts/autogpt/autogpt/commands/file_operations.py
@@ -16,7 +16,7 @@ from autogpt.core.utils.json_schema import JSONSchema
 from autogpt.memory.vector import MemoryItemFactory, VectorMemory
 
 from .decorators import sanitize_path_arg
-from .file_operations_utils import read_textual_file
+from .file_operations_utils import decode_textual_file
 
 COMMAND_CATEGORY = "file_operations"
 COMMAND_CATEGORY_TITLE = "File Operations"
@@ -140,8 +140,7 @@ def log_operation(
         )
     },
 )
-@sanitize_path_arg("filename")
-def read_file(filename: Path, agent: Agent) -> str:
+def read_file(filename: str | Path, agent: Agent) -> str:
     """Read a file and return the contents
 
     Args:
@@ -150,8 +149,8 @@ def read_file(filename: Path, agent: Agent) -> str:
     Returns:
         str: The contents of the file
     """
-    content = read_textual_file(filename, logger)
-    # TODO: content = agent.workspace.read_file(filename)
+    file = agent.workspace.open_file(filename, binary=True)
+    content = decode_textual_file(file, logger)
 
     # # TODO: invalidate/update memory when file is edited
     # file_memory = MemoryItem.from_text_file(content, str(filename), agent.config)
diff --git a/autogpts/autogpt/autogpt/commands/file_operations_utils.py b/autogpts/autogpt/autogpt/commands/file_operations_utils.py
index 6b24a6f11..e7c001e93 100644
--- a/autogpts/autogpt/autogpt/commands/file_operations_utils.py
+++ b/autogpts/autogpt/autogpt/commands/file_operations_utils.py
@@ -1,11 +1,11 @@
 import json
 import logging
 import os
-from pathlib import Path
+from abc import ABC, abstractmethod
+from typing import BinaryIO
 
 import charset_normalizer
 import docx
-import markdown
 import pypdf
 import yaml
 from bs4 import BeautifulSoup
@@ -14,23 +14,24 @@ from pylatexenc.latex2text import LatexNodes2Text
 logger = logging.getLogger(__name__)
 
 
-class ParserStrategy:
-    def read(self, file_path: Path) -> str:
-        raise NotImplementedError
+class ParserStrategy(ABC):
+    @abstractmethod
+    def read(self, file: BinaryIO) -> str:
+        ...
 
 
 # Basic text file reading
 class TXTParser(ParserStrategy):
-    def read(self, file_path: Path) -> str:
-        charset_match = charset_normalizer.from_path(file_path).best()
-        logger.debug(f"Reading '{file_path}' with encoding '{charset_match.encoding}'")
+    def read(self, file: BinaryIO) -> str:
+        charset_match = charset_normalizer.from_bytes(file.read()).best()
+        logger.debug(f"Reading '{file.name}' with encoding '{charset_match.encoding}'")
         return str(charset_match)
 
 
 # Reading text from binary file using pdf parser
 class PDFParser(ParserStrategy):
-    def read(self, file_path: Path) -> str:
-        parser = pypdf.PdfReader(file_path)
+    def read(self, file: BinaryIO) -> str:
+        parser = pypdf.PdfReader(file)
         text = ""
         for page_idx in range(len(parser.pages)):
             text += parser.pages[page_idx].extract_text()
@@ -39,8 +40,8 @@ class PDFParser(ParserStrategy):
 
 # Reading text from binary file using docs parser
 class DOCXParser(ParserStrategy):
-    def read(self, file_path: Path) -> str:
-        doc_file = docx.Document(file_path)
+    def read(self, file: BinaryIO) -> str:
+        doc_file = docx.Document(file)
         text = ""
         for para in doc_file.paragraphs:
             text += para.text
@@ -49,50 +50,37 @@ class DOCXParser(ParserStrategy):
 
 # Reading as dictionary and returning string format
 class JSONParser(ParserStrategy):
-    def read(self, file_path: Path) -> str:
-        with open(file_path, "r") as f:
-            data = json.load(f)
-            text = str(data)
+    def read(self, file: BinaryIO) -> str:
+        data = json.load(file)
+        text = str(data)
         return text
 
 
 class XMLParser(ParserStrategy):
-    def read(self, file_path: Path) -> str:
-        with open(file_path, "r") as f:
-            soup = BeautifulSoup(f, "xml")
-            text = soup.get_text()
+    def read(self, file: BinaryIO) -> str:
+        soup = BeautifulSoup(file, "xml")
+        text = soup.get_text()
         return text
 
 
 # Reading as dictionary and returning string format
 class YAMLParser(ParserStrategy):
-    def read(self, file_path: Path) -> str:
-        with open(file_path, "r") as f:
-            data = yaml.load(f, Loader=yaml.FullLoader)
-            text = str(data)
+    def read(self, file: BinaryIO) -> str:
+        data = yaml.load(file, Loader=yaml.FullLoader)
+        text = str(data)
         return text
 
 
 class HTMLParser(ParserStrategy):
-    def read(self, file_path: Path) -> str:
-        with open(file_path, "r") as f:
-            soup = BeautifulSoup(f, "html.parser")
-            text = soup.get_text()
-        return text
-
-
-class MarkdownParser(ParserStrategy):
-    def read(self, file_path: Path) -> str:
-        with open(file_path, "r") as f:
-            html = markdown.markdown(f.read())
-            text = "".join(BeautifulSoup(html, "html.parser").findAll(string=True))
+    def read(self, file: BinaryIO) -> str:
+        soup = BeautifulSoup(file, "html.parser")
+        text = soup.get_text()
         return text
 
 
 class LaTeXParser(ParserStrategy):
-    def read(self, file_path: Path) -> str:
-        with open(file_path, "r") as f:
-            latex = f.read()
+    def read(self, file: BinaryIO) -> str:
+        latex = file.read().decode()
         text = LatexNodes2Text().latex_to_text(latex)
         return text
 
@@ -106,13 +94,15 @@ class FileContext:
         self.logger.debug(f"Setting Context Parser to {parser}")
         self.parser = parser
 
-    def read_file(self, file_path) -> str:
-        self.logger.debug(f"Reading file {file_path} with parser {self.parser}")
-        return self.parser.read(file_path)
+    def decode_file(self, file: BinaryIO) -> str:
+        self.logger.debug(f"Reading file {file.name} with parser {self.parser}")
+        return self.parser.read(file)
 
 
 extension_to_parser = {
     ".txt": TXTParser(),
+    ".md": TXTParser(),
+    ".markdown": TXTParser(),
     ".csv": TXTParser(),
     ".pdf": PDFParser(),
     ".docx": DOCXParser(),
@@ -123,47 +113,36 @@ extension_to_parser = {
     ".html": HTMLParser(),
     ".htm": HTMLParser(),
     ".xhtml": HTMLParser(),
-    ".md": MarkdownParser(),
-    ".markdown": MarkdownParser(),
     ".tex": LaTeXParser(),
 }
 
 
-def is_file_binary_fn(file_path: Path):
+def is_file_binary_fn(file: BinaryIO):
     """Given a file path load all its content and checks if the null bytes is present
 
     Args:
-        file_path (_type_): _description_
+        file (_type_): _description_
 
     Returns:
         bool: is_binary
     """
-    with open(file_path, "rb") as f:
-        file_data = f.read()
+    file_data = file.read()
+    file.seek(0)
     if b"\x00" in file_data:
         return True
     return False
 
 
-def read_textual_file(file_path: Path, logger: logging.Logger) -> str:
-    if not file_path.is_absolute():
-        raise ValueError("File path must be absolute")
-
-    if not file_path.is_file():
-        if not file_path.exists():
-            raise FileNotFoundError(
-                f"read_file {file_path} failed: no such file or directory"
-            )
-        else:
-            raise ValueError(f"read_file failed: {file_path} is not a file")
+def decode_textual_file(file: BinaryIO, logger: logging.Logger) -> str:
+    if not file.readable():
+        raise ValueError(f"read_file failed: {file.name} is not a file")
 
-    is_binary = is_file_binary_fn(file_path)
-    file_extension = os.path.splitext(file_path)[1].lower()
+    file_extension = os.path.splitext(file.name)[1].lower()
     parser = extension_to_parser.get(file_extension)
     if not parser:
-        if is_binary:
+        if is_file_binary_fn(file):
             raise ValueError(f"Unsupported binary file format: {file_extension}")
         # fallback to txt file parser (to support script and code files loading)
         parser = TXTParser()
     file_context = FileContext(parser, logger)
-    return file_context.read_file(file_path)
+    return file_context.decode_file(file)
diff --git a/autogpts/autogpt/autogpt/file_workspace/base.py b/autogpts/autogpt/autogpt/file_workspace/base.py
index 2e4f43475..865b34dd5 100644
--- a/autogpts/autogpt/autogpt/file_workspace/base.py
+++ b/autogpts/autogpt/autogpt/file_workspace/base.py
@@ -5,8 +5,9 @@ from __future__ import annotations
 
 import logging
 from abc import ABC, abstractmethod
+from io import IOBase, TextIOBase
 from pathlib import Path
-from typing import Any, Callable, Literal, Optional, overload
+from typing import IO, Any, BinaryIO, Callable, Literal, Optional, TextIO, overload
 
 from autogpt.core.configuration.schema import SystemConfiguration
 
@@ -47,9 +48,23 @@ class FileWorkspace(ABC):
         doesn't exist yet. E.g. a folder on disk, or an S3 Bucket.
         """
 
+    @overload
+    @abstractmethod
+    def open_file(
+        self, path: str | Path, binary: Literal[False] = False
+    ) -> TextIO | TextIOBase:
+        """Returns a readable text file-like object representing the file."""
+
+    @overload
+    @abstractmethod
+    def open_file(
+        self, path: str | Path, binary: Literal[True] = True
+    ) -> BinaryIO | IOBase:
+        """Returns a readable binary file-like object representing the file."""
+
     @abstractmethod
-    def open_file(self, path: str | Path, mode: str = "r"):
-        """Open a file in the workspace."""
+    def open_file(self, path: str | Path, binary: bool = False) -> IO | IOBase:
+        """Returns a readable file-like object representing the file."""
 
     @overload
     @abstractmethod
diff --git a/autogpts/autogpt/autogpt/file_workspace/gcs.py b/autogpts/autogpt/autogpt/file_workspace/gcs.py
index bca410451..3a6b9772c 100644
--- a/autogpts/autogpt/autogpt/file_workspace/gcs.py
+++ b/autogpts/autogpt/autogpt/file_workspace/gcs.py
@@ -6,6 +6,7 @@ from __future__ import annotations
 
 import inspect
 import logging
+from io import IOBase
 from pathlib import Path
 
 from google.cloud import storage
@@ -40,7 +41,7 @@ class GCSFileWorkspace(FileWorkspace):
         return self._root
 
     @property
-    def restrict_to_root(self):
+    def restrict_to_root(self) -> bool:
         """Whether to restrict generated paths to the root."""
         return True
 
@@ -50,26 +51,28 @@ class GCSFileWorkspace(FileWorkspace):
     def get_path(self, relative_path: str | Path) -> Path:
         return super().get_path(relative_path).relative_to("/")
 
-    def open_file(self, path: str | Path, mode: str = "r"):
-        """Open a file in the workspace."""
+    def _get_blob(self, path: str | Path) -> storage.Blob:
         path = self.get_path(path)
-        blob = self._bucket.blob(str(path))
-        return blob
+        return self._bucket.blob(str(path))
+
+    def open_file(self, path: str | Path, binary: bool = False) -> IOBase:
+        """Open a file in the workspace."""
+        blob = self._get_blob(path)
+        blob.reload()  # pin revision number to prevent version mixing while reading
+        return blob.open("rb" if binary else "r")
 
     def read_file(self, path: str | Path, binary: bool = False) -> str | bytes:
         """Read a file in the workspace."""
-        blob = self.open_file(path, "r")
-        file_content = (
-            blob.download_as_text() if not binary else blob.download_as_bytes()
-        )
-        return file_content
+        return self.open_file(path, binary).read()
 
-    async def write_file(self, path: str | Path, content: str | bytes):
+    async def write_file(self, path: str | Path, content: str | bytes) -> None:
         """Write to a file in the workspace."""
-        blob = self.open_file(path, "w")
-        blob.upload_from_string(content) if isinstance(
-            content, str
-        ) else blob.upload_from_file(content)
+        blob = self._get_blob(path)
+
+        if isinstance(content, str):
+            blob.upload_from_string(content)
+        else:
+            blob.upload_from_file(content)
 
         if self.on_write_file:
             path = Path(path)
diff --git a/autogpts/autogpt/autogpt/file_workspace/local.py b/autogpts/autogpt/autogpt/file_workspace/local.py
index 347e29667..8c2aa6521 100644
--- a/autogpts/autogpt/autogpt/file_workspace/local.py
+++ b/autogpts/autogpt/autogpt/file_workspace/local.py
@@ -6,6 +6,7 @@ from __future__ import annotations
 import inspect
 import logging
 from pathlib import Path
+from typing import IO
 
 from .base import FileWorkspace, FileWorkspaceConfiguration
 
@@ -26,26 +27,29 @@ class LocalFileWorkspace(FileWorkspace):
         return self._root
 
     @property
-    def restrict_to_root(self):
+    def restrict_to_root(self) -> bool:
         """Whether to restrict generated paths to the root."""
         return self._restrict_to_root
 
     def initialize(self) -> None:
         self.root.mkdir(exist_ok=True, parents=True)
 
-    def open_file(self, path: str | Path, mode: str = "r"):
+    def open_file(self, path: str | Path, binary: bool = False) -> IO:
         """Open a file in the workspace."""
+        return self._open_file(path, "rb" if binary else "r")
+
+    def _open_file(self, path: str | Path, mode: str = "r") -> IO:
         full_path = self.get_path(path)
-        return open(full_path, mode)
+        return open(full_path, mode)  # type: ignore
 
-    def read_file(self, path: str | Path, binary: bool = False):
+    def read_file(self, path: str | Path, binary: bool = False) -> str | bytes:
         """Read a file in the workspace."""
-        with self.open_file(path, "rb" if binary else "r") as file:
+        with self._open_file(path, "rb" if binary else "r") as file:
             return file.read()
 
-    async def write_file(self, path: str | Path, content: str | bytes):
+    async def write_file(self, path: str | Path, content: str | bytes) -> None:
         """Write to a file in the workspace."""
-        with self.open_file(path, "wb" if type(content) is bytes else "w") as file:
+        with self._open_file(path, "wb" if type(content) is bytes else "w") as file:
             file.write(content)
 
         if self.on_write_file:
@@ -61,7 +65,7 @@ class LocalFileWorkspace(FileWorkspace):
         path = self.get_path(path)
         return [file.relative_to(path) for file in path.rglob("*") if file.is_file()]
 
-    def delete_file(self, path: str | Path):
+    def delete_file(self, path: str | Path) -> None:
         """Delete a file in the workspace."""
         full_path = self.get_path(path)
         full_path.unlink()
diff --git a/autogpts/autogpt/autogpt/file_workspace/s3.py b/autogpts/autogpt/autogpt/file_workspace/s3.py
index d52d9388f..79f3ae74e 100644
--- a/autogpts/autogpt/autogpt/file_workspace/s3.py
+++ b/autogpts/autogpt/autogpt/file_workspace/s3.py
@@ -8,6 +8,7 @@ import contextlib
 import inspect
 import logging
 import os
+from io import IOBase, TextIOWrapper
 from pathlib import Path
 from typing import TYPE_CHECKING, Optional
 
@@ -74,22 +75,27 @@ class S3FileWorkspace(FileWorkspace):
     def get_path(self, relative_path: str | Path) -> Path:
         return super().get_path(relative_path).relative_to("/")
 
-    def open_file(self, path: str | Path, mode: str = "r"):
-        """Open a file in the workspace."""
+    def _get_obj(self, path: str | Path) -> mypy_boto3_s3.service_resource.Object:
+        """Get an S3 object."""
         path = self.get_path(path)
         obj = self._bucket.Object(str(path))
         with contextlib.suppress(botocore.exceptions.ClientError):
             obj.load()
         return obj
 
+    def open_file(self, path: str | Path, binary: bool = False) -> IOBase:
+        """Open a file in the workspace."""
+        obj = self._get_obj(path)
+        return obj.get()["Body"] if not binary else TextIOWrapper(obj.get()["Body"])
+
     def read_file(self, path: str | Path, binary: bool = False) -> str | bytes:
         """Read a file in the workspace."""
-        file_content = self.open_file(path, "r").get()["Body"].read()
+        file_content = self.open_file(path, binary).read()
         return file_content if binary else file_content.decode()
 
-    async def write_file(self, path: str | Path, content: str | bytes):
+    async def write_file(self, path: str | Path, content: str | bytes) -> None:
         """Write to a file in the workspace."""
-        obj = self.open_file(path, "w")
+        obj = self._get_obj(path)
         obj.put(Body=content)
 
         if self.on_write_file:
diff --git a/autogpts/autogpt/autogpt/models/context_item.py b/autogpts/autogpt/autogpt/models/context_item.py
index 17a2a0b8e..0e5d9a373 100644
--- a/autogpts/autogpt/autogpt/models/context_item.py
+++ b/autogpts/autogpt/autogpt/models/context_item.py
@@ -5,7 +5,7 @@ from typing import Optional
 
 from pydantic import BaseModel, Field
 
-from autogpt.commands.file_operations_utils import read_textual_file
+from autogpt.commands.file_operations_utils import decode_textual_file
 
 logger = logging.getLogger(__name__)
 
@@ -56,7 +56,8 @@ class FileContextItem(BaseModel, ContextItem):
 
     @property
     def content(self) -> str:
-        return read_textual_file(self.file_path, logger)
+        with open(self.file_path, "rb") as file:
+            return decode_textual_file(file, logger)
 
 
 class FolderContextItem(BaseModel, ContextItem):
diff --git a/autogpts/autogpt/tests/unit/test_text_file_parsers.py b/autogpts/autogpt/tests/unit/test_text_file_parsers.py
index 3b1231489..d17de9c8c 100644
--- a/autogpts/autogpt/tests/unit/test_text_file_parsers.py
+++ b/autogpts/autogpt/tests/unit/test_text_file_parsers.py
@@ -5,10 +5,14 @@ from pathlib import Path
 from xml.etree import ElementTree
 
 import docx
+import pytest
 import yaml
 from bs4 import BeautifulSoup
 
-from autogpt.commands.file_operations_utils import is_file_binary_fn, read_textual_file
+from autogpt.commands.file_operations_utils import (
+    decode_textual_file,
+    is_file_binary_fn,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -148,15 +152,18 @@ respective_file_creation_functions = {
 binary_files_extensions = [".pdf", ".docx"]
 
 
-def test_parsers():
-    for (
-        file_extension,
-        c_file_creator,
-    ) in respective_file_creation_functions.items():
-        created_file_path = Path(c_file_creator())
-        loaded_text = read_textual_file(created_file_path, logger)
+@pytest.mark.parametrize(
+    "file_extension, c_file_creator",
+    respective_file_creation_functions.items(),
+)
+def test_parsers(file_extension, c_file_creator):
+    created_file_path = Path(c_file_creator())
+    with open(created_file_path, "rb") as file:
+        loaded_text = decode_textual_file(file, logger)
 
         assert plain_text_str in loaded_text
 
         should_be_binary = file_extension in binary_files_extensions
-        assert should_be_binary == is_file_binary_fn(created_file_path)
+        assert should_be_binary == is_file_binary_fn(file)
+
+    created_file_path.unlink()  # cleanup
author	Reinier van der Leer <pwuts@agpt.co>	2023-12-12 17:41:04 +0100
committer	Reinier van der Leer <pwuts@agpt.co>	2023-12-12 17:41:55 +0100
commit	d95e3b5b54bcac00541bbac1e69644c73970b72f (patch)
tree	f5df5cd23a06761af0706b03ab1b302e7b942547
parent	fix(agent/file_operations): Fix path processing in file_operations.py and acr... (diff)
download	Auto-GPT-d95e3b5b54bcac00541bbac1e69644c73970b72f.tar.gz Auto-GPT-d95e3b5b54bcac00541bbac1e69644c73970b72f.tar.bz2 Auto-GPT-d95e3b5b54bcac00541bbac1e69644c73970b72f.zip