path: root/benchmark/agbenchmark/conftest.py
blob: 71db82e01d41a12c7472484d11316897d63ad345 (plain)
import contextlib
import json
import os
import shutil
import sys
import threading
import time
from pathlib import Path  # noqa
from typing import Any, Generator

import pytest

from agbenchmark.__main__ import TEMP_FOLDER_ABS_PATH
from agbenchmark.reports.reports import (
    finalize_reports,
    generate_single_call_report,
    session_finish,
)
from agbenchmark.utils.data_types import AgentBenchmarkConfig

GLOBAL_TIMEOUT = (
    1500  # The tests will stop after 25 minutes so we can send the reports.
)

pytest_plugins = ["agbenchmark.utils.dependencies"]
collect_ignore = ["challenges"]
suite_reports: dict[str, list] = {}


def load_config_from_request(request: Any) -> AgentBenchmarkConfig:
    """
    This function loads the configuration for the agent benchmark from a given request.

    Args:
        request (Any): The request object from which the agent benchmark configuration is to be loaded.

    Returns:
        AgentBenchmarkConfig: The loaded agent benchmark configuration.

    Raises:
        json.JSONDecodeError: If the benchmark configuration file is not a valid JSON file.
    """
    agent_benchmark_config_path = Path.cwd() / "agbenchmark_config" / "config.json"
    try:
        with open(agent_benchmark_config_path, "r") as f:
            agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
            agent_benchmark_config.agent_benchmark_config_path = (
                agent_benchmark_config_path
            )
            return agent_benchmark_config
    except json.JSONDecodeError:
        print("Error: benchmark_config.json is not a valid JSON file.")
        raise
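
# Hedged example (not part of the original file): a minimal agbenchmark_config/config.json
# that the loader above would parse. The exact fields accepted by AgentBenchmarkConfig are
# defined in agbenchmark.utils.data_types; the keys shown here are illustrative assumptions.
#
#     {
#         "workspace": "agbenchmark_config/workspace",
#         "host": "http://localhost:8000"
#     }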


@pytest.fixture(scope="module")
def config(request: Any) -> Any:
    """
    This pytest fixture loads the agent benchmark configuration from agbenchmark_config/config.json
    in the current working directory.
    It is scoped to the module level, meaning it's invoked once per test module.

    Args:
        request (Any): The request object from which the agent benchmark configuration is to be loaded.

    Returns:
        Any: The loaded configuration dictionary.

    Raises:
        json.JSONDecodeError: If the benchmark configuration file is not a valid JSON file.
    """
    config = {}
    agent_benchmark_config_path = Path.cwd() / "agbenchmark_config" / "config.json"
    try:
        with open(agent_benchmark_config_path, "r") as f:
            agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
            agent_benchmark_config.agent_benchmark_config_path = (
                agent_benchmark_config_path
            )
    except json.JSONDecodeError:
        print("Error: benchmark_config.json is not a valid JSON file.")
        raise

    config["AgentBenchmarkConfig"] = agent_benchmark_config

    return config
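

# Illustrative sketch (assumption, not an original test): how a test could consume the
# module-scoped `config` fixture above. Only the fixture name and the "AgentBenchmarkConfig"
# key come from this file; the helper below is never collected or called.
def _example_config_consumer(config: dict) -> None:
    benchmark_config = config["AgentBenchmarkConfig"]
    # The config path is attached by the fixture right after the JSON is parsed.
    assert benchmark_config.agent_benchmark_config_path is not None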


@pytest.fixture(autouse=True)
def temp_folder() -> Generator[str, None, None]:
    """
    This pytest fixture is responsible for setting up and tearing down the temporary folder for each test.
    It is automatically used in every test due to the 'autouse=True' parameter.
    It is used in order to let agbenchmark store files so they can then be evaluated.
    """

    # create the output directory if it doesn't exist
    os.makedirs(TEMP_FOLDER_ABS_PATH, exist_ok=True)

    yield TEMP_FOLDER_ABS_PATH
    # teardown after test function completes
    if not os.getenv("KEEP_TEMP_FOLDER_FILES"):
        for filename in os.listdir(TEMP_FOLDER_ABS_PATH):
            file_path = os.path.join(TEMP_FOLDER_ABS_PATH, filename)
            try:
                if os.path.isfile(file_path) or os.path.islink(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                print(f"Failed to delete {file_path}. Reason: {e}")


def pytest_addoption(parser: Any) -> None:
    """
    This function is a pytest hook that is called to add command-line options.
    It is used to add custom command-line options that are specific to the agent benchmark tests.
    These options can be used to control the behavior of the tests.
    The "--mock" option is used to run the tests in mock mode.
    The "--host" option is used to specify the host for the tests.
    The "--category" option is used to run only tests of a specific category.
    The "--nc" option is used to run the tests without caching.
    The "--cutoff" option is used to specify a cutoff time for the tests.
    The "--improve" option is used to run only the tests that are marked for improvement.
    The "--maintain" option is used to run only the tests that are marked for maintenance.
    The "--explore" option is used to run the tests in exploration mode.
    The "--test" option is used to run a specific test.
    The "--no_dep" option is used to run the tests without dependencies.
    The "--keep_answers" option is used to keep the answers of the tests.

    Args:
        parser (Any): The parser object to which the command-line options are added.
    """
    parser.addoption("--no_dep", action="store_true", default=False)
    parser.addoption("--mock", action="store_true", default=False)
    parser.addoption("--host", action="store_true", default=None)
    parser.addoption("--nc", action="store_true", default=False)
    parser.addoption("--cutoff", action="store_true", default=False)
    parser.addoption("--category", action="store_true", default=False)
    parser.addoption("--test", action="store_true", default=None)
    parser.addoption("--improve", action="store_true", default=False)
    parser.addoption("--maintain", action="store_true", default=False)
    parser.addoption("--explore", action="store_true", default=False)
    parser.addoption("--keep-answers", action="store_true", default=False)


@pytest.fixture(autouse=True)
def check_regression(request: Any) -> None:
    """
    This pytest fixture is responsible for checking if a test is a regression test.
    It is automatically used in every test due to the 'autouse=True' parameter.
    The test name and the agent benchmark configuration are retrieved from the request object.
    The regression reports are loaded from the path specified in the agent benchmark configuration.
    If the "--improve" option is used and the test name exists in the regression tests, the test is skipped.
    If the "--maintain" option is used and the test name does not exist in the regression tests, the test is also skipped.

    Args:
        request (Any): The request object from which the test name and the agent benchmark configuration are retrieved.
    """
    test_name = request.node.parent.name
    agent_benchmark_config = load_config_from_request(request)
    with contextlib.suppress(Exception):
        regression_file = agent_benchmark_config.get_regression_reports_path()
        with open(regression_file, "r") as f:
            data = json.load(f)
        challenge_location = getattr(request.node.parent.cls, "CHALLENGE_LOCATION", "")

        skip_string = f"Skipping {test_name} at {challenge_location}"

        # Check if the test name exists in the regression tests
        if request.config.getoption("--improve") and data.get(test_name, None):
            pytest.skip(f"{skip_string} because it's a regression test")
        elif request.config.getoption("--maintain") and not data.get(test_name, None):
            pytest.skip(f"{skip_string} because it's not a regression test")


# this is to get the challenge_data from every test
@pytest.fixture(autouse=True)
def challenge_data(request: Any) -> Any:
    """
    This pytest fixture is responsible for providing the challenge data for each test.
    It is automatically used in every test due to the 'autouse=True' parameter.
    The challenge data is retrieved from the request object's parameters.
    This fixture is essential for the pytest system as it provides the necessary data for each test.

    Args:
        request (Any): The request object from which the challenge data is retrieved.

    Returns:
        Any: The challenge data taken from request.param for the current test.
    """
    return request.param
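
# How request.param is populated (illustrative): the challenge generation code parametrizes
# tests indirectly so that this fixture hands each test its challenge definition, roughly:
#
#     metafunc.parametrize("challenge_data", [challenge_dict], indirect=True)
#
# The exact call site lives in the challenge/test generation code, not in this file, and the
# variable names above are placeholders.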


@pytest.fixture(autouse=True, scope="session")
def mock(request: Any) -> bool:
    """
    This pytest fixture is responsible for retrieving the value of the "--mock" command-line option.
    It is automatically used in every test session due to the 'autouse=True' parameter and 'session' scope.
    The "--mock" option is used to run the tests in mock mode.
    This fixture is essential for the pytest system as it provides the necessary command-line option value for each test session.

    Args:
        request (Any): The request object from which the "--mock" option value is retrieved.

    Returns:
        bool: The value of the "--mock" command-line option for this session.
    """
    return request.config.getoption("--mock")


@pytest.fixture(autouse=True, scope="function")
def timer(request: Any) -> Any:
    """
    This pytest fixture is responsible for timing the execution of each test.
    It is automatically used in every test due to the 'autouse=True' parameter and 'function' scope.
    At the start of each test, it records the current time.
    After the test function completes, it calculates the run time and appends it to the test node's user properties.
    This allows the run time of each test to be accessed later for reporting or analysis.

    Args:
        request (Any): The request object from which the test node is retrieved.

    Yields:
        None: Control is yielded back to the test function.
    """
    start_time = time.time()
    yield
    run_time = time.time() - start_time
    request.node.user_properties.append(("run_time", run_time))
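
# Hedged example: downstream reporting code can recover the timing recorded above from the
# item's user_properties, e.g.
#
#     run_time = dict(item.user_properties).get("run_time")
#
# How generate_single_call_report actually consumes it is defined in agbenchmark.reports.reports.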


def pytest_runtest_makereport(item: Any, call: Any) -> None:
    """
    This function is a pytest hook that is called when a test report is being generated.
    It is used to generate and finalize reports for each test.

    Args:
        item (Any): The test item for which the report is being generated.
        call (Any): The call object from which the test result is retrieved.
    """
    challenge_data = item.funcargs.get("challenge_data", None)

    if not challenge_data:
        # this will only happen for dummy dependency setup tests
        return

    challenge_location: str = getattr(item.cls, "CHALLENGE_LOCATION", "")

    flags = (
        "--test" in sys.argv
        or "--maintain" in sys.argv
        or "--improve" in sys.argv
        or "--explore" in sys.argv
    )

    if call.when == "call":
        answers = getattr(item, "answers", None)
        test_name = item.nodeid.split("::")[1]
        item.test_name = test_name

        generate_single_call_report(
            item, call, challenge_data, answers, challenge_location, test_name
        )

    if call.when == "teardown":
        finalize_reports(item, challenge_data)


def timeout_monitor(start_time: float) -> None:
    """
    This function is responsible for monitoring the total execution time of the test suite.
    It runs in a separate thread and checks every second if the total execution time has exceeded the global timeout.
    If the global timeout is exceeded, it terminates the pytest session with a specific return code.

    Args:
        start_time (float): The start time of the test suite, as returned by time.time().
    """
    while time.time() - start_time < GLOBAL_TIMEOUT:
        time.sleep(1)  # check every second

    pytest.exit("Test suite exceeded the global timeout", returncode=1)


def pytest_sessionstart(session: Any) -> None:
    """
    This function is a pytest hook that is called at the start of the test session.
    It starts the timeout monitor in a separate thread.
    The timeout monitor checks if the total execution time of the test suite has exceeded the global timeout.

    Args:
        session (Any): The pytest session object.
    """
    start_time = time.time()
    t = threading.Thread(target=timeout_monitor, args=(start_time,))
    t.daemon = True  # Daemon threads are abruptly stopped at shutdown
    t.start()


def pytest_sessionfinish(session: Any) -> None:
    """
    This function is a pytest hook that is called at the end of the test session.
    It is used to finalize and save the test reports.
    It delegates to session_finish(), which finalizes and writes the collected suite reports.

    Args:
        session (Any): The pytest session object.
    """
    session_finish(suite_reports)


@pytest.fixture
def scores(request: Any) -> Any:
    """
    This pytest fixture is responsible for retrieving the scores of the test class.
    The scores are retrieved from the test class's 'scores' attribute using the test class name.
    This fixture is essential for the pytest system as it provides the necessary scores for each test.

    Args:
        request (Any): The request object from which the test class is retrieved.

    Returns:
        Any: The scores entry for the test class, or None if no entry exists.
    """
    test_class_name = request.node.cls.__name__
    return request.node.cls.scores.get(test_class_name)
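
# Hedged sketch (assumption): the class-level `scores` attribute read above is expected to be
# keyed by class name; only that keying is implied by the code, the nested fields are illustrative.
#
#     class TestWriteFile(Challenge):
#         scores = {"TestWriteFile": {"answer": 1.0, "max": 1.0}}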


# this is adding the dependency marker and category markers automatically from the json
def pytest_collection_modifyitems(items: Any, config: Any) -> None:
    """
    This function is a pytest hook that is called after the test collection has been performed.
    It is used to modify the collected test items based on the agent benchmark configuration.
    The function loads the agent benchmark configuration from the specified path and retrieves the regression reports.
    For each test item, it checks if the test method exists and retrieves the dependencies and categories from the test class instance.
    If the "--improve" or "--category" options are used, the dependencies are filtered based on the regression data.
    If the "--test", "--no_dep", or "--maintain" options are used, the dependencies are cleared.
    The function then dynamically adds the 'depends' and 'category' markers to the test item.
    This function is essential for the pytest system as it provides the necessary modification of the test items based on the agent benchmark configuration.

    Args:
        items (Any): The collected test items to be modified.
        config (Any): The pytest configuration object from which the agent benchmark configuration path is retrieved.
    """
    agent_benchmark_config_path = str(Path.cwd() / "agbenchmark_config" / "config.json")
    try:
        with open(agent_benchmark_config_path) as f:
            agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
    except json.JSONDecodeError:
        print("Error: benchmark_config.json is not a valid JSON file.")
        raise

    regression_file = agent_benchmark_config.get_regression_reports_path()
    if os.path.exists(regression_file):
        with open(regression_file, "r") as f:
            data = json.load(f)
    else:
        data = {}

    for item in items:
        # Assuming item.cls is your test class
        test_class_instance = item.cls()

        if "test_method" not in item.name:
            continue

        # Then you can access your properties
        name = item.parent.cls.__name__
        # dependencies = test_class_instance.data.dependencies

        # Filter dependencies if they exist in regression data if its an improvement test
        # if config.getoption("--improve") or config.getoption(
        #     "--category"
        # ):
        #     dependencies = [dep for dep in dependencies if not data.get(dep, None)]
        # if (
        #     config.getoption("--test")
        #     or config.getoption("--no_dep")
        #     or config.getoption("--maintain")
        # ):
        dependencies = test_class_instance.dependencies

        # Add depends marker dynamically
        item.add_marker(pytest.mark.depends(on=dependencies, name=name))

        categories = test_class_instance.data.category

        # Add category marker dynamically
        for category in categories:
            item.add_marker(getattr(pytest.mark, category))
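

# Hedged usage note: once the markers above are attached, standard pytest selection applies.
# For example, `pytest -m interface` would select tests whose challenge declares the
# "interface" category, and the `depends` markers are consumed by the
# agbenchmark.utils.dependencies plugin registered via pytest_plugins at the top of this file.
# The category name "interface" is illustrative.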