aboutsummaryrefslogtreecommitdiff
path: root/autogpts/autogpt/tests/unit/test_text_file_parsers.py
blob: ae2ad3ce5cef7459d1f8f5fbdb656db0427b46da (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import json
import logging
import tempfile
from pathlib import Path
from xml.etree import ElementTree

import docx
import yaml
from bs4 import BeautifulSoup

from autogpt.commands.file_operations_utils import is_file_binary_fn, read_textual_file

# Module-level logger; test_parsers passes it to read_textual_file.
logger = logging.getLogger(__name__)

# Text the mock files contain and that test_parsers expects to find in the
# output of read_textual_file.
plain_text_str = "Hello, world!"


def mock_text_file():
    """Create a temporary ``.txt`` file whose content is ``plain_text_str``.

    Returns:
        The path of the created file. The file is opened with
        ``delete=False``, so it stays on disk after it is closed.
    """
    txt_out = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt")
    with txt_out:
        txt_out.write(plain_text_str)
    return txt_out.name


def mock_csv_file():
    """Create a temporary ``.csv`` file whose content is ``plain_text_str``.

    Returns:
        The path of the created file. The file is opened with
        ``delete=False``, so it stays on disk after it is closed.
    """
    with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as out:
        csv_path = out.name
        out.write(plain_text_str)
    return csv_path


def mock_pdf_file():
    """Write a small hand-assembled PDF with the text "Hello, world!" to a temp file.

    The document is laid out as literal byte chunks (header, catalog, page,
    font, content stream, cross-reference table, trailer) which are joined
    and written in one go.

    NOTE(review): the bytes are kept exactly as they were. The declared
    ``/Length 25`` does not match the 45-byte content stream, and the xref
    offsets do not match the real byte positions (object 1 starts at byte 9,
    the table lists 17). Presumably the PDF reader used by the code under
    test tolerates this - confirm before "fixing" the fixture.
    NOTE(review): the NUL byte after ``%%EOF`` is presumably there so the
    file is detected as binary - confirm.

    Returns:
        The path of the created file. The file is opened with
        ``delete=False``, so it stays on disk after it is closed.
    """
    pdf_chunks = (
        # PDF header
        b"%PDF-1.7\n",
        # Document catalog
        b"1 0 obj\n",
        b"<< /Type /Catalog /Pages 2 0 R >>\n",
        b"endobj\n",
        # Page object
        b"2 0 obj\n",
        b"<< /Type /Page /Parent 1 0 R /Resources << /Font << /F1 3 0 R >> >> /MediaBox [0 0 612 792] /Contents 4 0 R >>\n",
        b"endobj\n",
        # Font object
        b"3 0 obj\n",
        b"<< /Type /Font /Subtype /Type1 /Name /F1 /BaseFont /Helvetica-Bold >>\n",
        b"endobj\n",
        # Page contents object
        b"4 0 obj\n",
        b"<< /Length 25 >>\n",
        b"stream\n",
        b"BT\n/F1 12 Tf\n72 720 Td\n(Hello, world!) Tj\nET\n",
        b"endstream\n",
        b"endobj\n",
        # Cross-reference table
        b"xref\n",
        b"0 5\n",
        b"0000000000 65535 f \n",
        b"0000000017 00000 n \n",
        b"0000000073 00000 n \n",
        b"0000000123 00000 n \n",
        b"0000000271 00000 n \n",
        # Trailer
        b"trailer\n",
        b"<< /Size 5 /Root 1 0 R >>\n",
        b"startxref\n",
        b"380\n",
        b"%%EOF\n",
        b"\x00",
    )
    pdf_out = tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=".pdf")
    with pdf_out:
        pdf_out.write(b"".join(pdf_chunks))
    return pdf_out.name


def mock_docx_file():
    """Create a temporary ``.docx`` file with one paragraph of ``plain_text_str``.

    Returns:
        The path of the created file. The file is opened with
        ``delete=False``, so it stays on disk after it is closed.
    """
    document = docx.Document()
    document.add_paragraph(plain_text_str)
    with tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=".docx") as f:
        # Save through the already-open handle rather than re-opening f.name:
        # opening a NamedTemporaryFile a second time by path while it is still
        # open fails on Windows.
        document.save(f)
    return f.name


def mock_json_file():
    """Create a temporary ``.json`` file holding ``{"text": plain_text_str}``.

    Returns:
        The path of the created file. The file is opened with
        ``delete=False``, so it stays on disk after it is closed.
    """
    payload = {"text": plain_text_str}
    json_out = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json")
    with json_out:
        json.dump(payload, json_out)
    return json_out.name


def mock_xml_file():
    """Create a temporary ``.xml`` file with a single ``<text>`` element.

    The element's text is ``plain_text_str``.

    Returns:
        The path of the created file. The file is opened with
        ``delete=False``, so it stays on disk after it is closed.
    """
    text_element = ElementTree.Element("text")
    text_element.text = plain_text_str
    xml_tree = ElementTree.ElementTree(text_element)
    xml_out = tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=".xml")
    with xml_out:
        xml_tree.write(xml_out)
    return xml_out.name


def mock_yaml_file():
    """Create a temporary ``.yaml`` file holding ``{"text": plain_text_str}``.

    Returns:
        The path of the created file. The file is opened with
        ``delete=False``, so it stays on disk after it is closed.
    """
    payload = {"text": plain_text_str}
    with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as out:
        yaml_path = out.name
        yaml.dump(payload, out)
    return yaml_path


def mock_html_file():
    """Create a temporary ``.html`` file with ``plain_text_str`` in a ``<p>`` tag.

    The markup is parsed with BeautifulSoup's ``html.parser`` and the
    serialized result is what gets written to disk.

    Returns:
        The path of the created file. The file is opened with
        ``delete=False``, so it stays on disk after it is closed.
    """
    markup = f"<html><head><title>This is a test</title></head><body><p>{plain_text_str}</p></body></html>"
    rendered = str(BeautifulSoup(markup, "html.parser"))
    html_out = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".html")
    with html_out:
        html_out.write(rendered)
    return html_out.name


def mock_md_file():
    """Create a temporary ``.md`` file with ``plain_text_str`` as a heading.

    The heading line is ``# `` + ``plain_text_str`` + ``!`` and a newline.

    Returns:
        The path of the created file. The file is opened with
        ``delete=False``, so it stays on disk after it is closed.
    """
    heading_line = f"# {plain_text_str}!\n"
    with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as out:
        md_path = out.name
        out.write(heading_line)
    return md_path


def mock_latex_file():
    """Create a temporary ``.tex`` file with ``plain_text_str`` as its body.

    The text is wrapped in a minimal ``article`` document environment.

    Returns:
        The path of the created file. The file is opened with
        ``delete=False``, so it stays on disk after it is closed.
    """
    tex_out = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".tex")
    with tex_out:
        tex_source = rf"\documentclass{{article}}\begin{{document}}{plain_text_str}\end{{document}}"
        tex_out.write(tex_source)
    return tex_out.name


# Maps each file extension under test to the helper that creates a sample
# file of that type; test_parsers iterates over every entry.
respective_file_creation_functions = {
    ".txt": mock_text_file,
    ".csv": mock_csv_file,
    ".pdf": mock_pdf_file,
    ".docx": mock_docx_file,
    ".json": mock_json_file,
    ".xml": mock_xml_file,
    ".yaml": mock_yaml_file,
    ".html": mock_html_file,
    ".md": mock_md_file,
    ".tex": mock_latex_file,
}
# Extensions whose sample files test_parsers expects is_file_binary_fn to
# report as binary; all other extensions are expected to be reported as text.
binary_files_extensions = [".pdf", ".docx"]


def test_parsers():
    """Check text extraction and binary detection for every mock file type.

    For each entry in ``respective_file_creation_functions`` a sample file is
    created, read back with ``read_textual_file`` and checked to contain
    ``plain_text_str``. ``is_file_binary_fn`` must report the file as binary
    exactly when its extension is listed in ``binary_files_extensions``.
    """
    for (
        file_extension,
        c_file_creator,
    ) in respective_file_creation_functions.items():
        created_file_path = Path(c_file_creator())
        try:
            loaded_text = read_textual_file(created_file_path, logger)

            assert (
                plain_text_str in loaded_text
            ), f"expected text not found in parsed {file_extension} file"

            should_be_binary = file_extension in binary_files_extensions
            assert should_be_binary == is_file_binary_fn(
                created_file_path
            ), f"unexpected binary detection result for {file_extension} file"
        finally:
            # The mock_* helpers create their files with delete=False, so
            # remove each one here to avoid leaking temp files on every run.
            if created_file_path.exists():
                created_file_path.unlink()