Diffstat (limited to 'benchmark/agbenchmark/conftest.py')
-rw-r--r--  benchmark/agbenchmark/conftest.py  |  490
1 file changed, 212 insertions(+), 278 deletions(-)
diff --git a/benchmark/agbenchmark/conftest.py b/benchmark/agbenchmark/conftest.py
index 33b0809ac..7e417a603 100644
--- a/benchmark/agbenchmark/conftest.py
+++ b/benchmark/agbenchmark/conftest.py
@@ -1,282 +1,189 @@
import contextlib
import json
+import logging
import os
import shutil
-import sys
import threading
import time
-from pathlib import Path # noqa
-from typing import Any, Generator
+from pathlib import Path
+from typing import Generator
import pytest
-from agbenchmark.__main__ import TEMP_FOLDER_ABS_PATH
+from agbenchmark.challenges import OPTIONAL_CATEGORIES, BaseChallenge
+from agbenchmark.config import AgentBenchmarkConfig
+from agbenchmark.reports.processing.report_types import Test
+from agbenchmark.reports.ReportManager import RegressionTestsTracker
from agbenchmark.reports.reports import (
- finalize_reports,
- generate_single_call_report,
+ add_test_result_to_report,
+ make_empty_test_report,
session_finish,
)
-from agbenchmark.utils.data_types import AgentBenchmarkConfig
+from agbenchmark.utils.data_types import Category
GLOBAL_TIMEOUT = (
1500 # The tests will stop after 25 minutes so we can send the reports.
)
+agbenchmark_config = AgentBenchmarkConfig.load()
+logger = logging.getLogger(__name__)
+
pytest_plugins = ["agbenchmark.utils.dependencies"]
collect_ignore = ["challenges"]
-suite_reports: dict[str, list] = {}
-
-
-def load_config_from_request(request: Any) -> AgentBenchmarkConfig:
- """
- This function loads the configuration for the agent benchmark from a given request.
-
- Args:
- request (Any): The request object from which the agent benchmark configuration is to be loaded.
-
- Returns:
- AgentBenchmarkConfig: The loaded agent benchmark configuration.
-
- Raises:
- json.JSONDecodeError: If the benchmark configuration file is not a valid JSON file.
- """
- agent_benchmark_config_path = Path.cwd() / "agbenchmark_config" / "config.json"
- try:
- with open(agent_benchmark_config_path, "r") as f:
- agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
- agent_benchmark_config.agent_benchmark_config_path = (
- agent_benchmark_config_path
- )
- return agent_benchmark_config
- except json.JSONDecodeError:
- print("Error: benchmark_config.json is not a valid JSON file.")
- raise
@pytest.fixture(scope="module")
-def config(request: Any) -> Any:
- """
- This pytest fixture is responsible for loading the agent benchmark configuration from a given request.
- This fixture is scoped to the module level, meaning it's invoked once per test module.
-
- Args:
- request (Any): The request object from which the agent benchmark configuration is to be loaded.
-
- Returns:
- Any: The loaded configuration dictionary.
-
- Raises:
- json.JSONDecodeError: If the benchmark configuration file is not a valid JSON file.
- """
- config = {}
- agent_benchmark_config_path = Path.cwd() / "agbenchmark_config" / "config.json"
- try:
- with open(agent_benchmark_config_path, "r") as f:
- agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
- agent_benchmark_config.agent_benchmark_config_path = (
- agent_benchmark_config_path
- )
- except json.JSONDecodeError:
- print("Error: benchmark_config.json is not a valid JSON file.")
- raise
-
- config["AgentBenchmarkConfig"] = agent_benchmark_config
-
- return config
+def config() -> AgentBenchmarkConfig:
+ return agbenchmark_config
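For reference, a module-scoped fixture like `config` is consumed simply by naming it as a test parameter. The sketch below is hypothetical (the test name is made up and is not part of this diff):

from agbenchmark.config import AgentBenchmarkConfig

# Hypothetical test consuming the module-scoped `config` fixture.
def test_benchmark_config_is_loaded(config: AgentBenchmarkConfig) -> None:
    # pytest injects the AgentBenchmarkConfig instance returned by the fixture above
    assert isinstance(config, AgentBenchmarkConfig)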
@pytest.fixture(autouse=True)
-def temp_folder() -> Generator[str, None, None]:
+def temp_folder() -> Generator[Path, None, None]:
"""
- This pytest fixture is responsible for setting up and tearing down the temporary folder for each test.
+ Pytest fixture that sets up and tears down the temporary folder for each test.
It is automatically used in every test due to the 'autouse=True' parameter.
- It is used in order to let agbenchmark store files so they can then be evaluated.
"""
# create output directory if it doesn't exist
- if not os.path.exists(TEMP_FOLDER_ABS_PATH):
- os.makedirs(TEMP_FOLDER_ABS_PATH, exist_ok=True)
+ if not os.path.exists(agbenchmark_config.temp_folder):
+ os.makedirs(agbenchmark_config.temp_folder, exist_ok=True)
- yield
+ yield agbenchmark_config.temp_folder
# teardown after test function completes
if not os.getenv("KEEP_TEMP_FOLDER_FILES"):
- for filename in os.listdir(TEMP_FOLDER_ABS_PATH):
- file_path = os.path.join(TEMP_FOLDER_ABS_PATH, filename)
+ for filename in os.listdir(agbenchmark_config.temp_folder):
+ file_path = os.path.join(agbenchmark_config.temp_folder, filename)
try:
if os.path.isfile(file_path) or os.path.islink(file_path):
os.unlink(file_path)
elif os.path.isdir(file_path):
shutil.rmtree(file_path)
except Exception as e:
- print(f"Failed to delete {file_path}. Reason: {e}")
+ logger.warning(f"Failed to delete {file_path}. Reason: {e}")
-def pytest_addoption(parser: Any) -> None:
+def pytest_addoption(parser: pytest.Parser) -> None:
"""
- This function is a pytest hook that is called to add command-line options.
- It is used to add custom command-line options that are specific to the agent benchmark tests.
- These options can be used to control the behavior of the tests.
- The "--mock" option is used to run the tests in mock mode.
- The "--host" option is used to specify the host for the tests.
- The "--category" option is used to run only tests of a specific category.
- The "--nc" option is used to run the tests without caching.
- The "--cutoff" option is used to specify a cutoff time for the tests.
- The "--improve" option is used to run only the tests that are marked for improvement.
- The "--maintain" option is used to run only the tests that are marked for maintenance.
- The "--explore" option is used to run the tests in exploration mode.
- The "--test" option is used to run a specific test.
- The "--no_dep" option is used to run the tests without dependencies.
- The "--keep_answers" option is used to keep the answers of the tests.
+ Pytest hook that adds command-line options to the `pytest` command.
+ The added options are specific to agbenchmark and control its behavior:
+ * `--mock` is used to run the tests in mock mode.
+ * `--host` is used to specify the host for the tests.
+ * `--category` is used to run only tests of a specific category.
+ * `--nc` is used to run the tests without caching.
+ * `--cutoff` is used to specify a cutoff time for the tests.
+ * `--improve` is used to run only the tests that are marked for improvement.
+ * `--maintain` is used to run only the tests that are marked for maintenance.
+ * `--explore` is used to run the tests in exploration mode.
+ * `--test` is used to run a specific test.
+ * `--no-dep` is used to run the tests without dependencies.
+    * `--keep-answers` is used to keep the answers of the tests.
+    * `-N`, `--attempts` is used to specify how many times to run each test.
Args:
- parser (Any): The parser object to which the command-line options are added.
+ parser: The Pytest CLI parser to which the command-line options are added.
"""
- parser.addoption("--no_dep", action="store_true", default=False)
- parser.addoption("--mock", action="store_true", default=False)
- parser.addoption("--host", action="store_true", default=None)
- parser.addoption("--nc", action="store_true", default=False)
- parser.addoption("--cutoff", action="store_true", default=False)
- parser.addoption("--category", action="store_true", default=False)
- parser.addoption("--test", action="store_true", default=None)
- parser.addoption("--improve", action="store_true", default=False)
- parser.addoption("--maintain", action="store_true", default=False)
- parser.addoption("--explore", action="store_true", default=False)
- parser.addoption("--keep-answers", action="store_true", default=False)
+ parser.addoption("-N", "--attempts", action="store")
+ parser.addoption("--no-dep", action="store_true")
+ parser.addoption("--mock", action="store_true")
+ parser.addoption("--host", default=None)
+ parser.addoption("--nc", action="store_true")
+ parser.addoption("--cutoff", action="store")
+ parser.addoption("--category", action="append")
+ parser.addoption("--test", action="append")
+ parser.addoption("--improve", action="store_true")
+ parser.addoption("--maintain", action="store_true")
+ parser.addoption("--explore", action="store_true")
+ parser.addoption("--keep-answers", action="store_true")
+
+
+def pytest_configure(config: pytest.Config) -> None:
+ # Register category markers to prevent "unknown marker" warnings
+ for category in Category:
+ config.addinivalue_line("markers", f"{category.value}: {category}")
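Registering the markers lets category-based selection (e.g. `pytest -m <category>`) run without unknown-marker warnings. A minimal, self-contained sketch of the same pattern, using a made-up enum in place of agbenchmark's Category:

from enum import Enum

import pytest

class ExampleCategory(Enum):  # stand-in for agbenchmark's Category enum
    CODING = "coding"
    RETRIEVAL = "retrieval"

def pytest_configure(config: pytest.Config) -> None:
    # Registering each value keeps `pytest -m coding` from emitting unknown-marker warnings
    for category in ExampleCategory:
        config.addinivalue_line("markers", f"{category.value}: {category}")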
@pytest.fixture(autouse=True)
-def check_regression(request: Any) -> None:
+def check_regression(request: pytest.FixtureRequest) -> None:
"""
- This pytest fixture is responsible for checking if a test is a regression test.
- It is automatically used in every test due to the 'autouse=True' parameter.
- The test name and the agent benchmark configuration are retrieved from the request object.
- The regression reports are loaded from the path specified in the agent benchmark configuration.
- If the "--improve" option is used and the test name exists in the regression tests, the test is skipped.
- If the "--maintain" option is used and the test name does not exist in the regression tests, the test is also skipped.
+ Fixture that checks for every test if it should be treated as a regression test,
+ and whether to skip it based on that.
+
+ The test name is retrieved from the `request` object. Regression reports are loaded
+ from the path specified in the benchmark configuration.
+
+ Effect:
+ * If the `--improve` option is used and the current test is considered a regression
+ test, it is skipped.
+ * If the `--maintain` option is used and the current test is not considered a
+ regression test, it is also skipped.
Args:
- request (Any): The request object from which the test name and the agent benchmark configuration are retrieved.
+ request: The request object from which the test name and the benchmark
+ configuration are retrieved.
"""
- test_name = request.node.parent.name
- agent_benchmark_config = load_config_from_request(request)
- with contextlib.suppress(Exception):
- test = agent_benchmark_config.get_regression_reports_path()
- print(f"Found a test {test}")
- data = json.loads(test)
- print(f"Got its data {data}")
- challenge_location = getattr(request.node.parent.cls, "CHALLENGE_LOCATION", "")
+ with contextlib.suppress(FileNotFoundError):
+ rt_tracker = RegressionTestsTracker(agbenchmark_config.regression_tests_file)
+ test_name = request.node.parent.name
+ challenge_location = getattr(request.node.parent.cls, "CHALLENGE_LOCATION", "")
skip_string = f"Skipping {test_name} at {challenge_location}"
# Check if the test name exists in the regression tests
- if request.config.getoption("--improve") and data.get(test_name, None):
+ is_regression_test = rt_tracker.has_regression_test(test_name)
+ if request.config.getoption("--improve") and is_regression_test:
pytest.skip(f"{skip_string} because it's a regression test")
- elif request.config.getoption("--maintain") and not data.get(test_name, None):
+ elif request.config.getoption("--maintain") and not is_regression_test:
pytest.skip(f"{skip_string} because it's not a regression test")
-# this is to get the challenge_data from every test
-@pytest.fixture(autouse=True)
-def challenge_data(request: Any) -> None:
- """
- This pytest fixture is responsible for providing the challenge data for each test.
- It is automatically used in every test due to the 'autouse=True' parameter.
- The challenge data is retrieved from the request object's parameters.
- This fixture is essential for the pytest system as it provides the necessary data for each test.
-
- Args:
- request (Any): The request object from which the challenge data is retrieved.
-
- Returns:
- None: The challenge data is directly passed to the test function and does not need to be returned.
- """
- print(f"REQUEST DATA HERE {request}")
- return request.param
-
-
@pytest.fixture(autouse=True, scope="session")
-def mock(request: Any) -> None:
+def mock(request: pytest.FixtureRequest) -> bool:
"""
- This pytest fixture is responsible for retrieving the value of the "--mock" command-line option.
- It is automatically used in every test session due to the 'autouse=True' parameter and 'session' scope.
- The "--mock" option is used to run the tests in mock mode.
- This fixture is essential for the pytest system as it provides the necessary command-line option value for each test session.
+ Pytest fixture that retrieves the value of the `--mock` command-line option.
+ The `--mock` option is used to run the tests in mock mode.
Args:
- request (Any): The request object from which the "--mock" option value is retrieved.
+ request: The `pytest.FixtureRequest` from which the `--mock` option value
+ is retrieved.
Returns:
- None: The "--mock" option value is directly passed to the test session and does not need to be returned.
+ bool: Whether `--mock` is set for this session.
"""
return request.config.getoption("--mock")
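A hypothetical test (not part of this diff) can request the session-scoped fixture by name and branch on mock mode:

import pytest

# Hypothetical consumer of the session-scoped `mock` fixture.
def test_agent_call(mock: bool) -> None:
    if mock:
        pytest.skip("illustrative: skip work that needs a live agent in --mock mode")
    # ...exercise the real agent here (omitted)...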
-@pytest.fixture(autouse=True, scope="function")
-def timer(request: Any) -> Any:
- """
- This pytest fixture is responsible for timing the execution of each test.
- It is automatically used in every test due to the 'autouse=True' parameter and 'function' scope.
- At the start of each test, it records the current time.
- After the test function completes, it calculates the run time and appends it to the test node's user properties.
- This allows the run time of each test to be accessed later for reporting or analysis.
-
- Args:
- request (Any): The request object from which the test node is retrieved.
-
- Yields:
- None: Control is yielded back to the test function.
- """
- start_time = time.time()
- yield
- run_time = time.time() - start_time
- request.node.user_properties.append(("run_time", run_time))
+test_reports: dict[str, Test] = {}
-def pytest_runtest_makereport(item: Any, call: Any) -> None:
+def pytest_runtest_makereport(item: pytest.Item, call: pytest.CallInfo) -> None:
"""
- This function is a pytest hook that is called when a test report is being generated.
+ Pytest hook that is called when a test report is being generated.
It is used to generate and finalize reports for each test.
Args:
- item (Any): The test item for which the report is being generated.
- call (Any): The call object from which the test result is retrieved.
+ item: The test item for which the report is being generated.
+ call: The call object from which the test result is retrieved.
"""
- challenge_data = item.funcargs.get("challenge_data", None)
+ challenge: type[BaseChallenge] = item.cls # type: ignore
+ challenge_id = challenge.info.eval_id
- print(f"pytest_runtest_makereport Challenge data: {challenge_data}")
+ if challenge_id not in test_reports:
+ test_reports[challenge_id] = make_empty_test_report(challenge.info)
- if not challenge_data:
- # this will only happen for dummy dependency setup tests
- return
-
- challenge_location: str = getattr(item.cls, "CHALLENGE_LOCATION", "")
-
- flags = (
- "--test" in sys.argv
- or "--maintain" in sys.argv
- or "--improve" in sys.argv
- or "--explore" in sys.argv
- )
-
- if call.when == "call":
- answers = getattr(item, "answers", None)
- challenge_location: str = getattr(item.cls, "CHALLENGE_LOCATION", "")
+ if call.when == "setup":
test_name = item.nodeid.split("::")[1]
- item.test_name = test_name
+ item.user_properties.append(("test_name", test_name))
- generate_single_call_report(
- item, call, challenge_data, answers, challenge_location, test_name
+ if call.when == "call":
+ add_test_result_to_report(
+ test_reports[challenge_id], item, call, agbenchmark_config
)
- if call.when == "teardown":
- finalize_reports(item, challenge_data)
-
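The hook above is invoked once per test phase ("setup", "call", "teardown"). A standalone sketch of the same phase-dispatch pattern, independent of agbenchmark's report types:

import pytest

# Generic sketch of per-phase handling (illustrative, not agbenchmark's report logic).
def pytest_runtest_makereport(item: pytest.Item, call: pytest.CallInfo) -> None:
    if call.when == "setup":
        item.user_properties.append(("setup_duration", call.duration))
    elif call.when == "call":
        # call.excinfo is None when the test body raised no exception
        item.user_properties.append(("passed", call.excinfo is None))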
def timeout_monitor(start_time: int) -> None:
"""
- This function is responsible for monitoring the total execution time of the test suite.
- It runs in a separate thread and checks every second if the total execution time has exceeded the global timeout.
- If the global timeout is exceeded, it terminates the pytest session with a specific return code.
+ Function that limits the total execution time of the test suite.
+ This function is supposed to be run in a separate thread and calls `pytest.exit`
+ if the total execution time has exceeded the global timeout.
Args:
start_time (int): The start time of the test suite.
@@ -287,14 +194,11 @@ def timeout_monitor(start_time: int) -> None:
pytest.exit("Test suite exceeded the global timeout", returncode=1)
-def pytest_sessionstart(session: Any) -> None:
+def pytest_sessionstart(session: pytest.Session) -> None:
"""
- This function is a pytest hook that is called at the start of the test session.
- It starts the timeout monitor in a separate thread.
- The timeout monitor checks if the total execution time of the test suite has exceeded the global timeout.
+ Pytest hook that is called at the start of a test session.
- Args:
- session (Any): The pytest session object.
+ Sets up and runs a `timeout_monitor` in a separate thread.
"""
start_time = time.time()
t = threading.Thread(target=timeout_monitor, args=(start_time,))
@@ -302,99 +206,129 @@ def pytest_sessionstart(session: Any) -> None:
t.start()
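Together, `pytest_sessionstart` and `timeout_monitor` form a watchdog-thread pattern. A self-contained sketch of the idea, assuming the same 1500-second budget as GLOBAL_TIMEOUT above (the loop body is a generic reconstruction, not the exact code):

import threading
import time

import pytest

GLOBAL_TIMEOUT = 1500  # seconds; mirrors the constant defined at the top of this file

def timeout_monitor(start_time: float) -> None:
    # Poll once per second and abort the whole session once the budget is spent.
    while time.time() - start_time < GLOBAL_TIMEOUT:
        time.sleep(1)
    pytest.exit("Test suite exceeded the global timeout", returncode=1)

def pytest_sessionstart(session: pytest.Session) -> None:
    watchdog = threading.Thread(
        target=timeout_monitor, args=(time.time(),), daemon=True
    )
    watchdog.start()  # daemon=True so the watchdog never blocks interpreter shutdown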
-def pytest_sessionfinish(session: Any) -> None:
- """
- This function is a pytest hook that is called at the end of the test session.
- It is used to finalize and save the test reports.
- The reports are saved in a specific location defined in the suite reports.
-
- Args:
- session (Any): The pytest session object.
+def pytest_sessionfinish(session: pytest.Session) -> None:
"""
- session_finish(suite_reports)
-
+ Pytest hook that is called at the end of a test session.
-@pytest.fixture
-def scores(request: Any) -> None:
+ Finalizes and saves the test reports.
"""
- This pytest fixture is responsible for retrieving the scores of the test class.
- The scores are retrieved from the test class's 'scores' attribute using the test class name.
- This fixture is essential for the pytest system as it provides the necessary scores for each test.
+ session_finish(agbenchmark_config)
- Args:
- request (Any): The request object from which the test class is retrieved.
- Returns:
- None: The scores are directly passed to the test function and do not need to be returned.
- """
- test_class_name = request.node.cls.__name__
- return request.node.cls.scores.get(test_class_name)
+def pytest_generate_tests(metafunc: pytest.Metafunc):
+ if type(n := metafunc.config.getoption("-N")) is str:
+ metafunc.parametrize("i_attempt", range(int(n)))
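With `-N`/`--attempts`, every collected test is parametrized over `i_attempt`. A hypothetical challenge class (its name and method are illustrative) would simply accept that parameter:

# Hypothetical challenge class; with `-N 3` its test runs three times,
# as i_attempt=0, 1 and 2.
class TestExampleChallenge:
    def test_method(self, i_attempt: int) -> None:
        # each attempt is collected as a separate parametrized test case
        assert i_attempt >= 0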
-# this is adding the dependency marker and category markers automatically from the json
-def pytest_collection_modifyitems(items: Any, config: Any) -> None:
+def pytest_collection_modifyitems(
+ items: list[pytest.Item], config: pytest.Config
+) -> None:
"""
- This function is a pytest hook that is called after the test collection has been performed.
- It is used to modify the collected test items based on the agent benchmark configuration.
- The function loads the agent benchmark configuration from the specified path and retrieves the regression reports.
- For each test item, it checks if the test method exists and retrieves the dependencies and categories from the test class instance.
- If the "--improve" or "--category" options are used, the dependencies are filtered based on the regression data.
- If the "--test", "--no_dep", or "--maintain" options are used, the dependencies are cleared.
- The function then dynamically adds the 'depends' and 'category' markers to the test item.
- This function is essential for the pytest system as it provides the necessary modification of the test items based on the agent benchmark configuration.
+ Pytest hook that is called after initial test collection has been performed.
+ Modifies the collected test items based on the agent benchmark configuration,
+ adding the dependency marker and category markers.
Args:
- items (Any): The collected test items to be modified.
- config (Any): The pytest configuration object from which the agent benchmark configuration path is retrieved.
+ items: The collected test items to be modified.
+ config: The active pytest configuration.
"""
- agent_benchmark_config_path = str(Path.cwd() / "agbenchmark_config" / "config.json")
- try:
- with open(agent_benchmark_config_path) as f:
- agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
- except json.JSONDecodeError:
- print("Error: benchmark_config.json is not a valid JSON file.")
- raise
+ rt_tracker = RegressionTestsTracker(agbenchmark_config.regression_tests_file)
- regression_file = agent_benchmark_config.get_regression_reports_path()
- data = (
- json.loads(open(regression_file, "r").read())
- if os.path.exists(regression_file)
- else {}
- )
-
- print(f"data??? {data}")
+ try:
+ challenges_beaten_in_the_past = json.loads(
+ agbenchmark_config.challenges_already_beaten_file.read_bytes()
+ )
+ except FileNotFoundError:
+ challenges_beaten_in_the_past = {}
+
+ selected_tests: tuple[str] = config.getoption("--test") # type: ignore
+ selected_categories: tuple[str] = config.getoption("--category") # type: ignore
+
+ # Can't use a for-loop to remove items in-place
+ i = 0
+ while i < len(items):
+ item = items[i]
+ challenge = item.cls
+ challenge_name = item.cls.__name__
+
+ if not issubclass(challenge, BaseChallenge):
+ item.warn(
+ pytest.PytestCollectionWarning(
+ f"Non-challenge item collected: {challenge}"
+ )
+ )
+ i += 1
+ continue
- for item in items:
- # Assuming item.cls is your test class
- test_class_instance = item.cls()
+ # --test: remove the test from the set if it's not specifically selected
+ if selected_tests and challenge.info.name not in selected_tests:
+ items.remove(item)
+ continue
- print(f"item: {item!r}")
+ # Filter challenges for --maintain, --improve, and --explore:
+ # --maintain -> only challenges expected to be passed (= regression tests)
+ # --improve -> only challenges that so far are not passed (reliably)
+ # --explore -> only challenges that have never been passed
+ is_regression_test = rt_tracker.has_regression_test(challenge.info.name)
+ has_been_passed = challenges_beaten_in_the_past.get(challenge.info.name, False)
+ if (
+ (config.getoption("--maintain") and not is_regression_test)
+ or (config.getoption("--improve") and is_regression_test)
+ or (config.getoption("--explore") and has_been_passed)
+ ):
+ items.remove(item)
+ continue
- if "test_method" not in item.name:
+ dependencies = challenge.info.dependencies
+ if (
+ config.getoption("--test")
+ or config.getoption("--no-dep")
+ or config.getoption("--maintain")
+ ):
+ # Ignore dependencies:
+ # --test -> user selected specific tests to run, don't care about deps
+ # --no-dep -> ignore dependency relations regardless of test selection
+ # --maintain -> all "regression" tests must pass, so run all of them
+ dependencies = []
+ elif config.getoption("--improve"):
+ # Filter dependencies, keep only deps that are not "regression" tests
+ dependencies = [
+ d for d in dependencies if not rt_tracker.has_regression_test(d)
+ ]
+
+ # Set category markers
+ challenge_categories = set(c.value for c in challenge.info.category)
+ for category in challenge_categories:
+ item.add_marker(category)
+
+ # Enforce category selection
+ if selected_categories:
+ if not challenge_categories.intersection(set(selected_categories)):
+ items.remove(item)
+ continue
+ # # Filter dependencies, keep only deps from selected categories
+ # dependencies = [
+ # d for d in dependencies
+ # if not set(d.categories).intersection(set(selected_categories))
+ # ]
+
+ # Skip items in optional categories that are not selected for the subject agent
+ challenge_optional_categories = challenge_categories & set(OPTIONAL_CATEGORIES)
+ if challenge_optional_categories and not (
+ agbenchmark_config.categories
+ and challenge_optional_categories.issubset(
+ set(agbenchmark_config.categories)
+ )
+ ):
+ logger.debug(
+ f"Skipping {challenge_name}: "
+ f"category {' and '.join(challenge_optional_categories)} is optional, "
+ "and not explicitly selected in the benchmark config."
+ )
+ items.remove(item)
continue
- # Then you can access your properties
- name = item.parent.cls.__name__
- # dependencies = test_class_instance.data.dependencies
-
- # Filter dependencies if they exist in regression data if its an improvement test
- # if config.getoption("--improve") or config.getoption(
- # "--category"
- # ):
- # dependencies = [dep for dep in dependencies if not data.get(dep, None)]
- # if (
- # config.getoption("--test")
- # or config.getoption("--no_dep")
- # or config.getoption("--maintain")
- # ):
- print(f"test_class_instance: {test_class_instance!r}")
- dependencies = test_class_instance.dependencies
-
- # Add depends marker dynamically
- item.add_marker(pytest.mark.depends(on=dependencies, name=name))
-
- categories = test_class_instance.data.category
-
- # Add category marker dynamically
- for category in categories:
- item.add_marker(getattr(pytest.mark, category))
+ # Add marker for the DependencyManager
+ item.add_marker(pytest.mark.depends(on=dependencies, name=challenge_name))
+
+ i += 1
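The index-based while loop above exists because removing items from a list while iterating over it with a for loop silently skips elements. A minimal generic sketch of the same filtering pattern:

# Generic illustration, not agbenchmark-specific.
items = ["keep", "drop", "drop", "keep"]

i = 0
while i < len(items):
    if items[i] == "drop":
        items.pop(i)  # the list shrinks, so do not advance the index
        continue
    i += 1

assert items == ["keep", "keep"]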