aboutsummaryrefslogtreecommitdiff
path: root/autogpt/commands/file_operations_utils.py
diff options
context:
space:
mode:
Diffstat (limited to 'autogpt/commands/file_operations_utils.py')
-rw-r--r--autogpt/commands/file_operations_utils.py161
1 files changed, 0 insertions, 161 deletions
diff --git a/autogpt/commands/file_operations_utils.py b/autogpt/commands/file_operations_utils.py
deleted file mode 100644
index b00779688..000000000
--- a/autogpt/commands/file_operations_utils.py
+++ /dev/null
@@ -1,161 +0,0 @@
-import json
-import os
-
-import charset_normalizer
-import docx
-import markdown
-import PyPDF2
-import yaml
-from bs4 import BeautifulSoup
-from pylatexenc.latex2text import LatexNodes2Text
-
-from autogpt import logs
-from autogpt.logs import logger
-
-
-class ParserStrategy:
- def read(self, file_path: str) -> str:
- raise NotImplementedError
-
-
-# Basic text file reading
-class TXTParser(ParserStrategy):
- def read(self, file_path: str) -> str:
- charset_match = charset_normalizer.from_path(file_path).best()
- logger.debug(f"Reading '{file_path}' with encoding '{charset_match.encoding}'")
- return str(charset_match)
-
-
-# Reading text from binary file using pdf parser
-class PDFParser(ParserStrategy):
- def read(self, file_path: str) -> str:
- parser = PyPDF2.PdfReader(file_path)
- text = ""
- for page_idx in range(len(parser.pages)):
- text += parser.pages[page_idx].extract_text()
- return text
-
-
-# Reading text from binary file using docs parser
-class DOCXParser(ParserStrategy):
- def read(self, file_path: str) -> str:
- doc_file = docx.Document(file_path)
- text = ""
- for para in doc_file.paragraphs:
- text += para.text
- return text
-
-
-# Reading as dictionary and returning string format
-class JSONParser(ParserStrategy):
- def read(self, file_path: str) -> str:
- with open(file_path, "r") as f:
- data = json.load(f)
- text = str(data)
- return text
-
-
-class XMLParser(ParserStrategy):
- def read(self, file_path: str) -> str:
- with open(file_path, "r") as f:
- soup = BeautifulSoup(f, "xml")
- text = soup.get_text()
- return text
-
-
-# Reading as dictionary and returning string format
-class YAMLParser(ParserStrategy):
- def read(self, file_path: str) -> str:
- with open(file_path, "r") as f:
- data = yaml.load(f, Loader=yaml.FullLoader)
- text = str(data)
- return text
-
-
-class HTMLParser(ParserStrategy):
- def read(self, file_path: str) -> str:
- with open(file_path, "r") as f:
- soup = BeautifulSoup(f, "html.parser")
- text = soup.get_text()
- return text
-
-
-class MarkdownParser(ParserStrategy):
- def read(self, file_path: str) -> str:
- with open(file_path, "r") as f:
- html = markdown.markdown(f.read())
- text = "".join(BeautifulSoup(html, "html.parser").findAll(string=True))
- return text
-
-
-class LaTeXParser(ParserStrategy):
- def read(self, file_path: str) -> str:
- with open(file_path, "r") as f:
- latex = f.read()
- text = LatexNodes2Text().latex_to_text(latex)
- return text
-
-
-class FileContext:
- def __init__(self, parser: ParserStrategy, logger: logs.Logger):
- self.parser = parser
- self.logger = logger
-
- def set_parser(self, parser: ParserStrategy) -> None:
- self.logger.debug(f"Setting Context Parser to {parser}")
- self.parser = parser
-
- def read_file(self, file_path) -> str:
- self.logger.debug(f"Reading file {file_path} with parser {self.parser}")
- return self.parser.read(file_path)
-
-
-extension_to_parser = {
- ".txt": TXTParser(),
- ".csv": TXTParser(),
- ".pdf": PDFParser(),
- ".docx": DOCXParser(),
- ".json": JSONParser(),
- ".xml": XMLParser(),
- ".yaml": YAMLParser(),
- ".yml": YAMLParser(),
- ".html": HTMLParser(),
- ".htm": HTMLParser(),
- ".xhtml": HTMLParser(),
- ".md": MarkdownParser(),
- ".markdown": MarkdownParser(),
- ".tex": LaTeXParser(),
-}
-
-
-def is_file_binary_fn(file_path: str):
- """Given a file path load all its content and checks if the null bytes is present
-
- Args:
- file_path (_type_): _description_
-
- Returns:
- bool: is_binary
- """
- with open(file_path, "rb") as f:
- file_data = f.read()
- if b"\x00" in file_data:
- return True
- return False
-
-
-def read_textual_file(file_path: str, logger: logs.Logger) -> str:
- if not os.path.isfile(file_path):
- raise FileNotFoundError(
- f"read_file {file_path} failed: no such file or directory"
- )
- is_binary = is_file_binary_fn(file_path)
- file_extension = os.path.splitext(file_path)[1].lower()
- parser = extension_to_parser.get(file_extension)
- if not parser:
- if is_binary:
- raise ValueError(f"Unsupported binary file format: {file_extension}")
- # fallback to txt file parser (to support script and code files loading)
- parser = TXTParser()
- file_context = FileContext(parser, logger)
- return file_context.read_file(file_path)