path: root/benchmark/agbenchmark/conftest.py
blob: 7e417a603d118703a7e1272388632713c4dc12ad (plain)
import contextlib
import json
import logging
import os
import shutil
import threading
import time
from pathlib import Path
from typing import Generator

import pytest

from agbenchmark.challenges import OPTIONAL_CATEGORIES, BaseChallenge
from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.reports.processing.report_types import Test
from agbenchmark.reports.ReportManager import RegressionTestsTracker
from agbenchmark.reports.reports import (
    add_test_result_to_report,
    make_empty_test_report,
    session_finish,
)
from agbenchmark.utils.data_types import Category

GLOBAL_TIMEOUT = (
    1500  # The tests will stop after 25 minutes so we can send the reports.
)

agbenchmark_config = AgentBenchmarkConfig.load()
logger = logging.getLogger(__name__)

pytest_plugins = ["agbenchmark.utils.dependencies"]
collect_ignore = ["challenges"]
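# `pytest_plugins` loads agbenchmark's dependency-handling plugin;
# `collect_ignore` keeps pytest from collecting raw files under `challenges/`
# directly (challenges are presumably collected via their challenge classes
# instead).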


@pytest.fixture(scope="module")
def config() -> AgentBenchmarkConfig:
    return agbenchmark_config


@pytest.fixture(autouse=True)
def temp_folder() -> Generator[Path, None, None]:
    """
    Pytest fixture that sets up and tears down the temporary folder for each test.
    It is automatically used in every test due to the 'autouse=True' parameter.
    """

    # create output directory if it doesn't exist
    if not os.path.exists(agbenchmark_config.temp_folder):
        os.makedirs(agbenchmark_config.temp_folder, exist_ok=True)

    yield agbenchmark_config.temp_folder
    # teardown after test function completes
    if not os.getenv("KEEP_TEMP_FOLDER_FILES"):
        for filename in os.listdir(agbenchmark_config.temp_folder):
            file_path = os.path.join(agbenchmark_config.temp_folder, filename)
            try:
                if os.path.isfile(file_path) or os.path.islink(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                logger.warning(f"Failed to delete {file_path}. Reason: {e}")
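
# Tip (illustrative; the challenge name is a placeholder): keep the workspace
# contents of a run for inspection by setting the env var checked above, e.g.
#   KEEP_TEMP_FOLDER_FILES=1 pytest --test <ChallengeName>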


def pytest_addoption(parser: pytest.Parser) -> None:
    """
    Pytest hook that adds command-line options to the `pytest` command.
    The added options are specific to agbenchmark and control its behavior
    (an illustrative combined invocation is given in a comment below this hook):
    * `-N`, `--attempts` is used to specify the number of attempts per challenge.
    * `--mock` is used to run the tests in mock mode.
    * `--host` is used to specify the host for the tests.
    * `--category` is used to run only tests of a specific category.
    * `--nc` is used to run the tests without caching.
    * `--cutoff` is used to specify a cutoff time for the tests.
    * `--improve` is used to run only the tests that are marked for improvement.
    * `--maintain` is used to run only the tests that are marked for maintenance.
    * `--explore` is used to run the tests in exploration mode.
    * `--test` is used to run a specific test.
    * `--no-dep` is used to run the tests without dependencies.
    * `--keep-answers` is used to keep the answers of the tests.

    Args:
        parser: The Pytest CLI parser to which the command-line options are added.
    """
    parser.addoption("-N", "--attempts", action="store")
    parser.addoption("--no-dep", action="store_true")
    parser.addoption("--mock", action="store_true")
    parser.addoption("--host", default=None)
    parser.addoption("--nc", action="store_true")
    parser.addoption("--cutoff", action="store")
    parser.addoption("--category", action="append")
    parser.addoption("--test", action="append")
    parser.addoption("--improve", action="store_true")
    parser.addoption("--maintain", action="store_true")
    parser.addoption("--explore", action="store_true")
    parser.addoption("--keep-answers", action="store_true")
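
# Illustrative invocation combining several of these options (the category
# name and cutoff value below are hypothetical):
#   pytest --mock --category=coding -N 3 --cutoff=120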


def pytest_configure(config: pytest.Config) -> None:
    # Register category markers to prevent "unknown marker" warnings
    for category in Category:
        config.addinivalue_line("markers", f"{category.value}: {category}")


@pytest.fixture(autouse=True)
def check_regression(request: pytest.FixtureRequest) -> None:
    """
    Fixture that checks, for each test, whether it should be treated as a
    regression test and whether it should be skipped on that basis.

    The test name is retrieved from the `request` object. Regression reports are loaded
    from the path specified in the benchmark configuration.

    Effect:
    * If the `--improve` option is used and the current test is considered a regression
      test, it is skipped.
    * If the `--maintain` option is used and the current test is not considered a
      regression test, it is also skipped.

    Args:
        request: The request object from which the test name and the benchmark
            configuration are retrieved.
    """
    with contextlib.suppress(FileNotFoundError):
        rt_tracker = RegressionTestsTracker(agbenchmark_config.regression_tests_file)

        test_name = request.node.parent.name
        challenge_location = getattr(request.node.parent.cls, "CHALLENGE_LOCATION", "")
        skip_string = f"Skipping {test_name} at {challenge_location}"

        # Check if the test name exists in the regression tests
        is_regression_test = rt_tracker.has_regression_test(test_name)
        if request.config.getoption("--improve") and is_regression_test:
            pytest.skip(f"{skip_string} because it's a regression test")
        elif request.config.getoption("--maintain") and not is_regression_test:
            pytest.skip(f"{skip_string} because it's not a regression test")
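
# Note: if the regression tests file does not exist yet, the FileNotFoundError
# is suppressed and no regression-based skipping takes place.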


@pytest.fixture(autouse=True, scope="session")
def mock(request: pytest.FixtureRequest) -> bool:
    """
    Pytest fixture that retrieves the value of the `--mock` command-line option.
    The `--mock` option is used to run the tests in mock mode.

    Args:
        request: The `pytest.FixtureRequest` from which the `--mock` option value
            is retrieved.

    Returns:
        bool: Whether `--mock` is set for this session.
    """
    return request.config.getoption("--mock")


test_reports: dict[str, Test] = {}


def pytest_runtest_makereport(item: pytest.Item, call: pytest.CallInfo) -> None:
    """
    Pytest hook that is called when a test report is being generated.
    It is used to generate and finalize reports for each test.

    Args:
        item: The test item for which the report is being generated.
        call: The call object from which the test result is retrieved.
    """
    challenge: type[BaseChallenge] = item.cls  # type: ignore
    challenge_id = challenge.info.eval_id

    if challenge_id not in test_reports:
        test_reports[challenge_id] = make_empty_test_report(challenge.info)

    if call.when == "setup":
        test_name = item.nodeid.split("::")[1]
        item.user_properties.append(("test_name", test_name))

    if call.when == "call":
        add_test_result_to_report(
            test_reports[challenge_id], item, call, agbenchmark_config
        )
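
# Note: reports are keyed by the challenge's eval_id, so multiple attempts of
# the same challenge (e.g. with -N) feed into a single Test report; the actual
# result is recorded during the "call" phase above.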


def timeout_monitor(start_time: float) -> None:
    """
    Function that limits the total execution time of the test suite.
    This function is supposed to be run in a separate thread and calls `pytest.exit`
    if the total execution time has exceeded the global timeout.

    Args:
        start_time (float): The start time of the test suite.
    """
    while time.time() - start_time < GLOBAL_TIMEOUT:
        time.sleep(1)  # check every second

    pytest.exit("Test suite exceeded the global timeout", returncode=1)


def pytest_sessionstart(session: pytest.Session) -> None:
    """
    Pytest hook that is called at the start of a test session.

    Sets up and runs a `timeout_monitor` in a separate thread.
    """
    start_time = time.time()
    t = threading.Thread(target=timeout_monitor, args=(start_time,))
    t.daemon = True  # Daemon threads are abruptly stopped at shutdown
    t.start()


def pytest_sessionfinish(session: pytest.Session) -> None:
    """
    Pytest hook that is called at the end of a test session.

    Finalizes and saves the test reports.
    """
    session_finish(agbenchmark_config)


def pytest_generate_tests(metafunc: pytest.Metafunc):
    if type(n := metafunc.config.getoption("-N")) is str:
        metafunc.parametrize("i_attempt", range(int(n)))
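
# Note: with e.g. `-N 3`, each challenge is parametrized into three attempts
# (i_attempt = 0, 1, 2), i.e. three separate test items per challenge.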


def pytest_collection_modifyitems(
    items: list[pytest.Item], config: pytest.Config
) -> None:
    """
    Pytest hook that is called after initial test collection has been performed.
    Modifies the collected test items based on the agent benchmark configuration,
    adding the dependency marker and category markers.

    Args:
        items: The collected test items to be modified.
        config: The active pytest configuration.
    """
    rt_tracker = RegressionTestsTracker(agbenchmark_config.regression_tests_file)

    try:
        challenges_beaten_in_the_past = json.loads(
            agbenchmark_config.challenges_already_beaten_file.read_bytes()
        )
    except FileNotFoundError:
        challenges_beaten_in_the_past = {}

    selected_tests: tuple[str] = config.getoption("--test")  # type: ignore
    selected_categories: tuple[str] = config.getoption("--category")  # type: ignore

    # Can't use a for-loop to remove items in-place
    i = 0
    while i < len(items):
        item = items[i]
        challenge = item.cls
        challenge_name = item.cls.__name__

        if not issubclass(challenge, BaseChallenge):
            item.warn(
                pytest.PytestCollectionWarning(
                    f"Non-challenge item collected: {challenge}"
                )
            )
            i += 1
            continue

        # --test: remove the test from the set if it's not specifically selected
        if selected_tests and challenge.info.name not in selected_tests:
            items.remove(item)
            continue

        # Filter challenges for --maintain, --improve, and --explore:
        # --maintain -> only challenges expected to be passed (= regression tests)
        # --improve -> only challenges that so far are not passed (reliably)
        # --explore -> only challenges that have never been passed
        is_regression_test = rt_tracker.has_regression_test(challenge.info.name)
        has_been_passed = challenges_beaten_in_the_past.get(challenge.info.name, False)
        if (
            (config.getoption("--maintain") and not is_regression_test)
            or (config.getoption("--improve") and is_regression_test)
            or (config.getoption("--explore") and has_been_passed)
        ):
            items.remove(item)
            continue

        dependencies = challenge.info.dependencies
        if (
            config.getoption("--test")
            or config.getoption("--no-dep")
            or config.getoption("--maintain")
        ):
            # Ignore dependencies:
            # --test -> user selected specific tests to run, don't care about deps
            # --no-dep -> ignore dependency relations regardless of test selection
            # --maintain -> all "regression" tests must pass, so run all of them
            dependencies = []
        elif config.getoption("--improve"):
            # Filter dependencies, keep only deps that are not "regression" tests
            dependencies = [
                d for d in dependencies if not rt_tracker.has_regression_test(d)
            ]

        # Set category markers
        challenge_categories = set(c.value for c in challenge.info.category)
        for category in challenge_categories:
            item.add_marker(category)

        # Enforce category selection
        if selected_categories:
            if not challenge_categories.intersection(set(selected_categories)):
                items.remove(item)
                continue
            # # Filter dependencies, keep only deps from selected categories
            # dependencies = [
            #     d for d in dependencies
            #     if not set(d.categories).intersection(set(selected_categories))
            # ]

        # Skip items in optional categories that are not selected for the subject agent
        challenge_optional_categories = challenge_categories & set(OPTIONAL_CATEGORIES)
        if challenge_optional_categories and not (
            agbenchmark_config.categories
            and challenge_optional_categories.issubset(
                set(agbenchmark_config.categories)
            )
        ):
            logger.debug(
                f"Skipping {challenge_name}: "
                f"category {' and '.join(challenge_optional_categories)} is optional, "
                "and not explicitly selected in the benchmark config."
            )
            items.remove(item)
            continue

        # Add marker for the DependencyManager
        item.add_marker(pytest.mark.depends(on=dependencies, name=challenge_name))

        i += 1
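
# The `depends` marker added above is consumed by the dependency plugin loaded
# through `pytest_plugins` ("agbenchmark.utils.dependencies"); based on the
# DependencyManager comment above, it orders tests by their dependencies and
# is presumably responsible for skipping tests whose dependencies did not pass.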