about summary refs log tree commit diff
diff options
context:
space:
mode:
authorGravatar Reinier van der Leer <pwuts@agpt.co> 2023-12-12 17:41:04 +0100
committerGravatar Reinier van der Leer <pwuts@agpt.co> 2023-12-12 17:41:55 +0100
commitd95e3b5b54bcac00541bbac1e69644c73970b72f (patch)
treef5df5cd23a06761af0706b03ab1b302e7b942547
parentfix(agent/file_operations): Fix path processing in file_operations.py and acr... (diff)
downloadAuto-GPT-d95e3b5b54bcac00541bbac1e69644c73970b72f.tar.gz
Auto-GPT-d95e3b5b54bcac00541bbac1e69644c73970b72f.tar.bz2
Auto-GPT-d95e3b5b54bcac00541bbac1e69644c73970b72f.zip
refactor(agent/file_operations): Refactor file opening/reading and parsing
- Update the signature of `FileWorkspace.open_file` and fix implementations in every workspace backend - Replace `open()` with `workspace.open_file` in the `read_file` command to use the workspace's file opening functionality - Fix the parametrization of the `test_text_file_parsers` test to correctly test text file parsers
-rw-r--r--autogpts/autogpt/autogpt/commands/file_operations.py9
-rw-r--r--autogpts/autogpt/autogpt/commands/file_operations_utils.py105
-rw-r--r--autogpts/autogpt/autogpt/file_workspace/base.py21
-rw-r--r--autogpts/autogpt/autogpt/file_workspace/gcs.py33
-rw-r--r--autogpts/autogpt/autogpt/file_workspace/local.py20
-rw-r--r--autogpts/autogpt/autogpt/file_workspace/s3.py16
-rw-r--r--autogpts/autogpt/autogpt/models/context_item.py5
-rw-r--r--autogpts/autogpt/tests/unit/test_text_file_parsers.py25
8 files changed, 124 insertions, 110 deletions
diff --git a/autogpts/autogpt/autogpt/commands/file_operations.py b/autogpts/autogpt/autogpt/commands/file_operations.py
index 6f4986105..9d70b3874 100644
--- a/autogpts/autogpt/autogpt/commands/file_operations.py
+++ b/autogpts/autogpt/autogpt/commands/file_operations.py
@@ -16,7 +16,7 @@ from autogpt.core.utils.json_schema import JSONSchema
from autogpt.memory.vector import MemoryItemFactory, VectorMemory
from .decorators import sanitize_path_arg
-from .file_operations_utils import read_textual_file
+from .file_operations_utils import decode_textual_file
COMMAND_CATEGORY = "file_operations"
COMMAND_CATEGORY_TITLE = "File Operations"
@@ -140,8 +140,7 @@ def log_operation(
)
},
)
-@sanitize_path_arg("filename")
-def read_file(filename: Path, agent: Agent) -> str:
+def read_file(filename: str | Path, agent: Agent) -> str:
"""Read a file and return the contents
Args:
@@ -150,8 +149,8 @@ def read_file(filename: Path, agent: Agent) -> str:
Returns:
str: The contents of the file
"""
- content = read_textual_file(filename, logger)
- # TODO: content = agent.workspace.read_file(filename)
+ file = agent.workspace.open_file(filename, binary=True)
+ content = decode_textual_file(file, logger)
# # TODO: invalidate/update memory when file is edited
# file_memory = MemoryItem.from_text_file(content, str(filename), agent.config)
diff --git a/autogpts/autogpt/autogpt/commands/file_operations_utils.py b/autogpts/autogpt/autogpt/commands/file_operations_utils.py
index 6b24a6f11..e7c001e93 100644
--- a/autogpts/autogpt/autogpt/commands/file_operations_utils.py
+++ b/autogpts/autogpt/autogpt/commands/file_operations_utils.py
@@ -1,11 +1,11 @@
import json
import logging
import os
-from pathlib import Path
+from abc import ABC, abstractmethod
+from typing import BinaryIO
import charset_normalizer
import docx
-import markdown
import pypdf
import yaml
from bs4 import BeautifulSoup
@@ -14,23 +14,24 @@ from pylatexenc.latex2text import LatexNodes2Text
logger = logging.getLogger(__name__)
-class ParserStrategy:
- def read(self, file_path: Path) -> str:
- raise NotImplementedError
+class ParserStrategy(ABC):
+ @abstractmethod
+ def read(self, file: BinaryIO) -> str:
+ ...
# Basic text file reading
class TXTParser(ParserStrategy):
- def read(self, file_path: Path) -> str:
- charset_match = charset_normalizer.from_path(file_path).best()
- logger.debug(f"Reading '{file_path}' with encoding '{charset_match.encoding}'")
+ def read(self, file: BinaryIO) -> str:
+ charset_match = charset_normalizer.from_bytes(file.read()).best()
+ logger.debug(f"Reading '{file.name}' with encoding '{charset_match.encoding}'")
return str(charset_match)
# Reading text from binary file using pdf parser
class PDFParser(ParserStrategy):
- def read(self, file_path: Path) -> str:
- parser = pypdf.PdfReader(file_path)
+ def read(self, file: BinaryIO) -> str:
+ parser = pypdf.PdfReader(file)
text = ""
for page_idx in range(len(parser.pages)):
text += parser.pages[page_idx].extract_text()
@@ -39,8 +40,8 @@ class PDFParser(ParserStrategy):
# Reading text from binary file using docs parser
class DOCXParser(ParserStrategy):
- def read(self, file_path: Path) -> str:
- doc_file = docx.Document(file_path)
+ def read(self, file: BinaryIO) -> str:
+ doc_file = docx.Document(file)
text = ""
for para in doc_file.paragraphs:
text += para.text
@@ -49,50 +50,37 @@ class DOCXParser(ParserStrategy):
# Reading as dictionary and returning string format
class JSONParser(ParserStrategy):
- def read(self, file_path: Path) -> str:
- with open(file_path, "r") as f:
- data = json.load(f)
- text = str(data)
+ def read(self, file: BinaryIO) -> str:
+ data = json.load(file)
+ text = str(data)
return text
class XMLParser(ParserStrategy):
- def read(self, file_path: Path) -> str:
- with open(file_path, "r") as f:
- soup = BeautifulSoup(f, "xml")
- text = soup.get_text()
+ def read(self, file: BinaryIO) -> str:
+ soup = BeautifulSoup(file, "xml")
+ text = soup.get_text()
return text
# Reading as dictionary and returning string format
class YAMLParser(ParserStrategy):
- def read(self, file_path: Path) -> str:
- with open(file_path, "r") as f:
- data = yaml.load(f, Loader=yaml.FullLoader)
- text = str(data)
+ def read(self, file: BinaryIO) -> str:
+ data = yaml.load(file, Loader=yaml.FullLoader)
+ text = str(data)
return text
class HTMLParser(ParserStrategy):
- def read(self, file_path: Path) -> str:
- with open(file_path, "r") as f:
- soup = BeautifulSoup(f, "html.parser")
- text = soup.get_text()
- return text
-
-
-class MarkdownParser(ParserStrategy):
- def read(self, file_path: Path) -> str:
- with open(file_path, "r") as f:
- html = markdown.markdown(f.read())
- text = "".join(BeautifulSoup(html, "html.parser").findAll(string=True))
+ def read(self, file: BinaryIO) -> str:
+ soup = BeautifulSoup(file, "html.parser")
+ text = soup.get_text()
return text
class LaTeXParser(ParserStrategy):
- def read(self, file_path: Path) -> str:
- with open(file_path, "r") as f:
- latex = f.read()
+ def read(self, file: BinaryIO) -> str:
+ latex = file.read().decode()
text = LatexNodes2Text().latex_to_text(latex)
return text
@@ -106,13 +94,15 @@ class FileContext:
self.logger.debug(f"Setting Context Parser to {parser}")
self.parser = parser
- def read_file(self, file_path) -> str:
- self.logger.debug(f"Reading file {file_path} with parser {self.parser}")
- return self.parser.read(file_path)
+ def decode_file(self, file: BinaryIO) -> str:
+ self.logger.debug(f"Reading file {file.name} with parser {self.parser}")
+ return self.parser.read(file)
extension_to_parser = {
".txt": TXTParser(),
+ ".md": TXTParser(),
+ ".markdown": TXTParser(),
".csv": TXTParser(),
".pdf": PDFParser(),
".docx": DOCXParser(),
@@ -123,47 +113,36 @@ extension_to_parser = {
".html": HTMLParser(),
".htm": HTMLParser(),
".xhtml": HTMLParser(),
- ".md": MarkdownParser(),
- ".markdown": MarkdownParser(),
".tex": LaTeXParser(),
}
-def is_file_binary_fn(file_path: Path):
+def is_file_binary_fn(file: BinaryIO):
"""Given a file path load all its content and checks if the null bytes is present
Args:
- file_path (_type_): _description_
+ file (_type_): _description_
Returns:
bool: is_binary
"""
- with open(file_path, "rb") as f:
- file_data = f.read()
+ file_data = file.read()
+ file.seek(0)
if b"\x00" in file_data:
return True
return False
-def read_textual_file(file_path: Path, logger: logging.Logger) -> str:
- if not file_path.is_absolute():
- raise ValueError("File path must be absolute")
-
- if not file_path.is_file():
- if not file_path.exists():
- raise FileNotFoundError(
- f"read_file {file_path} failed: no such file or directory"
- )
- else:
- raise ValueError(f"read_file failed: {file_path} is not a file")
+def decode_textual_file(file: BinaryIO, logger: logging.Logger) -> str:
+ if not file.readable():
+ raise ValueError(f"read_file failed: {file.name} is not a file")
- is_binary = is_file_binary_fn(file_path)
- file_extension = os.path.splitext(file_path)[1].lower()
+ file_extension = os.path.splitext(file.name)[1].lower()
parser = extension_to_parser.get(file_extension)
if not parser:
- if is_binary:
+ if is_file_binary_fn(file):
raise ValueError(f"Unsupported binary file format: {file_extension}")
# fallback to txt file parser (to support script and code files loading)
parser = TXTParser()
file_context = FileContext(parser, logger)
- return file_context.read_file(file_path)
+ return file_context.decode_file(file)
diff --git a/autogpts/autogpt/autogpt/file_workspace/base.py b/autogpts/autogpt/autogpt/file_workspace/base.py
index 2e4f43475..865b34dd5 100644
--- a/autogpts/autogpt/autogpt/file_workspace/base.py
+++ b/autogpts/autogpt/autogpt/file_workspace/base.py
@@ -5,8 +5,9 @@ from __future__ import annotations
import logging
from abc import ABC, abstractmethod
+from io import IOBase, TextIOBase
from pathlib import Path
-from typing import Any, Callable, Literal, Optional, overload
+from typing import IO, Any, BinaryIO, Callable, Literal, Optional, TextIO, overload
from autogpt.core.configuration.schema import SystemConfiguration
@@ -47,9 +48,23 @@ class FileWorkspace(ABC):
doesn't exist yet. E.g. a folder on disk, or an S3 Bucket.
"""
+ @overload
+ @abstractmethod
+ def open_file(
+ self, path: str | Path, binary: Literal[False] = False
+ ) -> TextIO | TextIOBase:
+ """Returns a readable text file-like object representing the file."""
+
+ @overload
+ @abstractmethod
+ def open_file(
+ self, path: str | Path, binary: Literal[True] = True
+ ) -> BinaryIO | IOBase:
+ """Returns a readable binary file-like object representing the file."""
+
@abstractmethod
- def open_file(self, path: str | Path, mode: str = "r"):
- """Open a file in the workspace."""
+ def open_file(self, path: str | Path, binary: bool = False) -> IO | IOBase:
+ """Returns a readable file-like object representing the file."""
@overload
@abstractmethod
diff --git a/autogpts/autogpt/autogpt/file_workspace/gcs.py b/autogpts/autogpt/autogpt/file_workspace/gcs.py
index bca410451..3a6b9772c 100644
--- a/autogpts/autogpt/autogpt/file_workspace/gcs.py
+++ b/autogpts/autogpt/autogpt/file_workspace/gcs.py
@@ -6,6 +6,7 @@ from __future__ import annotations
import inspect
import logging
+from io import IOBase
from pathlib import Path
from google.cloud import storage
@@ -40,7 +41,7 @@ class GCSFileWorkspace(FileWorkspace):
return self._root
@property
- def restrict_to_root(self):
+ def restrict_to_root(self) -> bool:
"""Whether to restrict generated paths to the root."""
return True
@@ -50,26 +51,28 @@ class GCSFileWorkspace(FileWorkspace):
def get_path(self, relative_path: str | Path) -> Path:
return super().get_path(relative_path).relative_to("/")
- def open_file(self, path: str | Path, mode: str = "r"):
- """Open a file in the workspace."""
+ def _get_blob(self, path: str | Path) -> storage.Blob:
path = self.get_path(path)
- blob = self._bucket.blob(str(path))
- return blob
+ return self._bucket.blob(str(path))
+
+ def open_file(self, path: str | Path, binary: bool = False) -> IOBase:
+ """Open a file in the workspace."""
+ blob = self._get_blob(path)
+ blob.reload() # pin revision number to prevent version mixing while reading
+ return blob.open("rb" if binary else "r")
def read_file(self, path: str | Path, binary: bool = False) -> str | bytes:
"""Read a file in the workspace."""
- blob = self.open_file(path, "r")
- file_content = (
- blob.download_as_text() if not binary else blob.download_as_bytes()
- )
- return file_content
+ return self.open_file(path, binary).read()
- async def write_file(self, path: str | Path, content: str | bytes):
+ async def write_file(self, path: str | Path, content: str | bytes) -> None:
"""Write to a file in the workspace."""
- blob = self.open_file(path, "w")
- blob.upload_from_string(content) if isinstance(
- content, str
- ) else blob.upload_from_file(content)
+ blob = self._get_blob(path)
+
+ if isinstance(content, str):
+ blob.upload_from_string(content)
+ else:
+ blob.upload_from_file(content)
if self.on_write_file:
path = Path(path)
diff --git a/autogpts/autogpt/autogpt/file_workspace/local.py b/autogpts/autogpt/autogpt/file_workspace/local.py
index 347e29667..8c2aa6521 100644
--- a/autogpts/autogpt/autogpt/file_workspace/local.py
+++ b/autogpts/autogpt/autogpt/file_workspace/local.py
@@ -6,6 +6,7 @@ from __future__ import annotations
import inspect
import logging
from pathlib import Path
+from typing import IO
from .base import FileWorkspace, FileWorkspaceConfiguration
@@ -26,26 +27,29 @@ class LocalFileWorkspace(FileWorkspace):
return self._root
@property
- def restrict_to_root(self):
+ def restrict_to_root(self) -> bool:
"""Whether to restrict generated paths to the root."""
return self._restrict_to_root
def initialize(self) -> None:
self.root.mkdir(exist_ok=True, parents=True)
- def open_file(self, path: str | Path, mode: str = "r"):
+ def open_file(self, path: str | Path, binary: bool = False) -> IO:
"""Open a file in the workspace."""
+ return self._open_file(path, "rb" if binary else "r")
+
+ def _open_file(self, path: str | Path, mode: str = "r") -> IO:
full_path = self.get_path(path)
- return open(full_path, mode)
+ return open(full_path, mode) # type: ignore
- def read_file(self, path: str | Path, binary: bool = False):
+ def read_file(self, path: str | Path, binary: bool = False) -> str | bytes:
"""Read a file in the workspace."""
- with self.open_file(path, "rb" if binary else "r") as file:
+ with self._open_file(path, "rb" if binary else "r") as file:
return file.read()
- async def write_file(self, path: str | Path, content: str | bytes):
+ async def write_file(self, path: str | Path, content: str | bytes) -> None:
"""Write to a file in the workspace."""
- with self.open_file(path, "wb" if type(content) is bytes else "w") as file:
+ with self._open_file(path, "wb" if type(content) is bytes else "w") as file:
file.write(content)
if self.on_write_file:
@@ -61,7 +65,7 @@ class LocalFileWorkspace(FileWorkspace):
path = self.get_path(path)
return [file.relative_to(path) for file in path.rglob("*") if file.is_file()]
- def delete_file(self, path: str | Path):
+ def delete_file(self, path: str | Path) -> None:
"""Delete a file in the workspace."""
full_path = self.get_path(path)
full_path.unlink()
diff --git a/autogpts/autogpt/autogpt/file_workspace/s3.py b/autogpts/autogpt/autogpt/file_workspace/s3.py
index d52d9388f..79f3ae74e 100644
--- a/autogpts/autogpt/autogpt/file_workspace/s3.py
+++ b/autogpts/autogpt/autogpt/file_workspace/s3.py
@@ -8,6 +8,7 @@ import contextlib
import inspect
import logging
import os
+from io import IOBase, TextIOWrapper
from pathlib import Path
from typing import TYPE_CHECKING, Optional
@@ -74,22 +75,27 @@ class S3FileWorkspace(FileWorkspace):
def get_path(self, relative_path: str | Path) -> Path:
return super().get_path(relative_path).relative_to("/")
- def open_file(self, path: str | Path, mode: str = "r"):
- """Open a file in the workspace."""
+ def _get_obj(self, path: str | Path) -> mypy_boto3_s3.service_resource.Object:
+ """Get an S3 object."""
path = self.get_path(path)
obj = self._bucket.Object(str(path))
with contextlib.suppress(botocore.exceptions.ClientError):
obj.load()
return obj
+ def open_file(self, path: str | Path, binary: bool = False) -> IOBase:
+ """Open a file in the workspace."""
+ obj = self._get_obj(path)
+ return obj.get()["Body"] if not binary else TextIOWrapper(obj.get()["Body"])
+
def read_file(self, path: str | Path, binary: bool = False) -> str | bytes:
"""Read a file in the workspace."""
- file_content = self.open_file(path, "r").get()["Body"].read()
+ file_content = self.open_file(path, binary).read()
return file_content if binary else file_content.decode()
- async def write_file(self, path: str | Path, content: str | bytes):
+ async def write_file(self, path: str | Path, content: str | bytes) -> None:
"""Write to a file in the workspace."""
- obj = self.open_file(path, "w")
+ obj = self._get_obj(path)
obj.put(Body=content)
if self.on_write_file:
diff --git a/autogpts/autogpt/autogpt/models/context_item.py b/autogpts/autogpt/autogpt/models/context_item.py
index 17a2a0b8e..0e5d9a373 100644
--- a/autogpts/autogpt/autogpt/models/context_item.py
+++ b/autogpts/autogpt/autogpt/models/context_item.py
@@ -5,7 +5,7 @@ from typing import Optional
from pydantic import BaseModel, Field
-from autogpt.commands.file_operations_utils import read_textual_file
+from autogpt.commands.file_operations_utils import decode_textual_file
logger = logging.getLogger(__name__)
@@ -56,7 +56,8 @@ class FileContextItem(BaseModel, ContextItem):
@property
def content(self) -> str:
- return read_textual_file(self.file_path, logger)
+ with open(self.file_path, "rb") as file:
+ return decode_textual_file(file, logger)
class FolderContextItem(BaseModel, ContextItem):
diff --git a/autogpts/autogpt/tests/unit/test_text_file_parsers.py b/autogpts/autogpt/tests/unit/test_text_file_parsers.py
index 3b1231489..d17de9c8c 100644
--- a/autogpts/autogpt/tests/unit/test_text_file_parsers.py
+++ b/autogpts/autogpt/tests/unit/test_text_file_parsers.py
@@ -5,10 +5,14 @@ from pathlib import Path
from xml.etree import ElementTree
import docx
+import pytest
import yaml
from bs4 import BeautifulSoup
-from autogpt.commands.file_operations_utils import is_file_binary_fn, read_textual_file
+from autogpt.commands.file_operations_utils import (
+ decode_textual_file,
+ is_file_binary_fn,
+)
logger = logging.getLogger(__name__)
@@ -148,15 +152,18 @@ respective_file_creation_functions = {
binary_files_extensions = [".pdf", ".docx"]
-def test_parsers():
- for (
- file_extension,
- c_file_creator,
- ) in respective_file_creation_functions.items():
- created_file_path = Path(c_file_creator())
- loaded_text = read_textual_file(created_file_path, logger)
+@pytest.mark.parametrize(
+ "file_extension, c_file_creator",
+ respective_file_creation_functions.items(),
+)
+def test_parsers(file_extension, c_file_creator):
+ created_file_path = Path(c_file_creator())
+ with open(created_file_path, "rb") as file:
+ loaded_text = decode_textual_file(file, logger)
assert plain_text_str in loaded_text
should_be_binary = file_extension in binary_files_extensions
- assert should_be_binary == is_file_binary_fn(created_file_path)
+ assert should_be_binary == is_file_binary_fn(file)
+
+ created_file_path.unlink() # cleanup