Diffstat (limited to 'benchmark/agbenchmark/conftest.py')
-rw-r--r-- | benchmark/agbenchmark/conftest.py | 490 |
1 file changed, 212 insertions(+), 278 deletions(-)
diff --git a/benchmark/agbenchmark/conftest.py b/benchmark/agbenchmark/conftest.py
index 33b0809ac..7e417a603 100644
--- a/benchmark/agbenchmark/conftest.py
+++ b/benchmark/agbenchmark/conftest.py
@@ -1,282 +1,189 @@
 import contextlib
 import json
+import logging
 import os
 import shutil
-import sys
 import threading
 import time
-from pathlib import Path  # noqa
-from typing import Any, Generator
+from pathlib import Path
+from typing import Generator
 
 import pytest
 
-from agbenchmark.__main__ import TEMP_FOLDER_ABS_PATH
+from agbenchmark.challenges import OPTIONAL_CATEGORIES, BaseChallenge
+from agbenchmark.config import AgentBenchmarkConfig
+from agbenchmark.reports.processing.report_types import Test
+from agbenchmark.reports.ReportManager import RegressionTestsTracker
 from agbenchmark.reports.reports import (
-    finalize_reports,
-    generate_single_call_report,
+    add_test_result_to_report,
+    make_empty_test_report,
     session_finish,
 )
-from agbenchmark.utils.data_types import AgentBenchmarkConfig
+from agbenchmark.utils.data_types import Category
 
 GLOBAL_TIMEOUT = (
     1500  # The tests will stop after 25 minutes so we can send the reports.
 )
 
+agbenchmark_config = AgentBenchmarkConfig.load()
+logger = logging.getLogger(__name__)
+
 pytest_plugins = ["agbenchmark.utils.dependencies"]
 collect_ignore = ["challenges"]
-suite_reports: dict[str, list] = {}
-
-
-def load_config_from_request(request: Any) -> AgentBenchmarkConfig:
-    """
-    This function loads the configuration for the agent benchmark from a given request.
-
-    Args:
-        request (Any): The request object from which the agent benchmark configuration is to be loaded.
-
-    Returns:
-        AgentBenchmarkConfig: The loaded agent benchmark configuration.
-
-    Raises:
-        json.JSONDecodeError: If the benchmark configuration file is not a valid JSON file.
-    """
-    agent_benchmark_config_path = Path.cwd() / "agbenchmark_config" / "config.json"
-    try:
-        with open(agent_benchmark_config_path, "r") as f:
-            agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
-            agent_benchmark_config.agent_benchmark_config_path = (
-                agent_benchmark_config_path
-            )
-            return agent_benchmark_config
-    except json.JSONDecodeError:
-        print("Error: benchmark_config.json is not a valid JSON file.")
-        raise
 
 
 @pytest.fixture(scope="module")
-def config(request: Any) -> Any:
-    """
-    This pytest fixture is responsible for loading the agent benchmark configuration from a given request.
-    This fixture is scoped to the module level, meaning it's invoked once per test module.
-
-    Args:
-        request (Any): The request object from which the agent benchmark configuration is to be loaded.
-
-    Returns:
-        Any: The loaded configuration dictionary.
-
-    Raises:
-        json.JSONDecodeError: If the benchmark configuration file is not a valid JSON file.
-    """
-    config = {}
-    agent_benchmark_config_path = Path.cwd() / "agbenchmark_config" / "config.json"
-    try:
-        with open(agent_benchmark_config_path, "r") as f:
-            agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
-            agent_benchmark_config.agent_benchmark_config_path = (
-                agent_benchmark_config_path
-            )
-    except json.JSONDecodeError:
-        print("Error: benchmark_config.json is not a valid JSON file.")
-        raise
-
-    config["AgentBenchmarkConfig"] = agent_benchmark_config
-
-    return config
+def config() -> AgentBenchmarkConfig:
+    return agbenchmark_config
 
 
 @pytest.fixture(autouse=True)
-def temp_folder() -> Generator[str, None, None]:
+def temp_folder() -> Generator[Path, None, None]:
     """
-    This pytest fixture is responsible for setting up and tearing down the temporary folder for each test.
+    Pytest fixture that sets up and tears down the temporary folder for each test.
     It is automatically used in every test due to the 'autouse=True' parameter.
-    It is used in order to let agbenchmark store files so they can then be evaluated.
     """
 
     # create output directory if it doesn't exist
-    if not os.path.exists(TEMP_FOLDER_ABS_PATH):
-        os.makedirs(TEMP_FOLDER_ABS_PATH, exist_ok=True)
+    if not os.path.exists(agbenchmark_config.temp_folder):
+        os.makedirs(agbenchmark_config.temp_folder, exist_ok=True)
 
-    yield
+    yield agbenchmark_config.temp_folder
    # teardown after test function completes
    if not os.getenv("KEEP_TEMP_FOLDER_FILES"):
-        for filename in os.listdir(TEMP_FOLDER_ABS_PATH):
-            file_path = os.path.join(TEMP_FOLDER_ABS_PATH, filename)
+        for filename in os.listdir(agbenchmark_config.temp_folder):
+            file_path = os.path.join(agbenchmark_config.temp_folder, filename)
             try:
                 if os.path.isfile(file_path) or os.path.islink(file_path):
                     os.unlink(file_path)
                 elif os.path.isdir(file_path):
                     shutil.rmtree(file_path)
             except Exception as e:
-                print(f"Failed to delete {file_path}. Reason: {e}")
+                logger.warning(f"Failed to delete {file_path}. Reason: {e}")
 
 
-def pytest_addoption(parser: Any) -> None:
+def pytest_addoption(parser: pytest.Parser) -> None:
     """
-    This function is a pytest hook that is called to add command-line options.
-    It is used to add custom command-line options that are specific to the agent benchmark tests.
-    These options can be used to control the behavior of the tests.
-    The "--mock" option is used to run the tests in mock mode.
-    The "--host" option is used to specify the host for the tests.
-    The "--category" option is used to run only tests of a specific category.
-    The "--nc" option is used to run the tests without caching.
-    The "--cutoff" option is used to specify a cutoff time for the tests.
-    The "--improve" option is used to run only the tests that are marked for improvement.
-    The "--maintain" option is used to run only the tests that are marked for maintenance.
-    The "--explore" option is used to run the tests in exploration mode.
-    The "--test" option is used to run a specific test.
-    The "--no_dep" option is used to run the tests without dependencies.
-    The "--keep_answers" option is used to keep the answers of the tests.
+    Pytest hook that adds command-line options to the `pytest` command.
+    The added options are specific to agbenchmark and control its behavior:
+    * `--mock` is used to run the tests in mock mode.
+    * `--host` is used to specify the host for the tests.
+    * `--category` is used to run only tests of a specific category.
+    * `--nc` is used to run the tests without caching.
+    * `--cutoff` is used to specify a cutoff time for the tests.
+    * `--improve` is used to run only the tests that are marked for improvement.
+    * `--maintain` is used to run only the tests that are marked for maintenance.
+    * `--explore` is used to run the tests in exploration mode.
+    * `--test` is used to run a specific test.
+    * `--no-dep` is used to run the tests without dependencies.
+    * `--keep-answers` is used to keep the answers of the tests.
 
     Args:
-        parser (Any): The parser object to which the command-line options are added.
+        parser: The Pytest CLI parser to which the command-line options are added.
     """
-    parser.addoption("--no_dep", action="store_true", default=False)
-    parser.addoption("--mock", action="store_true", default=False)
-    parser.addoption("--host", action="store_true", default=None)
-    parser.addoption("--nc", action="store_true", default=False)
-    parser.addoption("--cutoff", action="store_true", default=False)
-    parser.addoption("--category", action="store_true", default=False)
-    parser.addoption("--test", action="store_true", default=None)
-    parser.addoption("--improve", action="store_true", default=False)
-    parser.addoption("--maintain", action="store_true", default=False)
-    parser.addoption("--explore", action="store_true", default=False)
-    parser.addoption("--keep-answers", action="store_true", default=False)
+    parser.addoption("-N", "--attempts", action="store")
+    parser.addoption("--no-dep", action="store_true")
+    parser.addoption("--mock", action="store_true")
+    parser.addoption("--host", default=None)
+    parser.addoption("--nc", action="store_true")
+    parser.addoption("--cutoff", action="store")
+    parser.addoption("--category", action="append")
+    parser.addoption("--test", action="append")
+    parser.addoption("--improve", action="store_true")
+    parser.addoption("--maintain", action="store_true")
+    parser.addoption("--explore", action="store_true")
+    parser.addoption("--keep-answers", action="store_true")
+
+
+def pytest_configure(config: pytest.Config) -> None:
+    # Register category markers to prevent "unknown marker" warnings
+    for category in Category:
+        config.addinivalue_line("markers", f"{category.value}: {category}")
 
 
 @pytest.fixture(autouse=True)
-def check_regression(request: Any) -> None:
+def check_regression(request: pytest.FixtureRequest) -> None:
     """
-    This pytest fixture is responsible for checking if a test is a regression test.
-    It is automatically used in every test due to the 'autouse=True' parameter.
-    The test name and the agent benchmark configuration are retrieved from the request object.
-    The regression reports are loaded from the path specified in the agent benchmark configuration.
-    If the "--improve" option is used and the test name exists in the regression tests, the test is skipped.
-    If the "--maintain" option is used and the test name does not exist in the regression tests, the test is also skipped.
+    Fixture that checks for every test if it should be treated as a regression test,
+    and whether to skip it based on that.
+
+    The test name is retrieved from the `request` object. Regression reports are loaded
+    from the path specified in the benchmark configuration.
+
+    Effect:
+    * If the `--improve` option is used and the current test is considered a regression
+      test, it is skipped.
+    * If the `--maintain` option is used and the current test is not considered a
+      regression test, it is also skipped.
 
     Args:
-        request (Any): The request object from which the test name and the agent benchmark configuration are retrieved.
+        request: The request object from which the test name and the benchmark
+            configuration are retrieved.
     """
-    test_name = request.node.parent.name
-    agent_benchmark_config = load_config_from_request(request)
-    with contextlib.suppress(Exception):
-        test = agent_benchmark_config.get_regression_reports_path()
-        print(f"Found a test {test}")
-        data = json.loads(test)
-        print(f"Got its data {data}")
-        challenge_location = getattr(request.node.parent.cls, "CHALLENGE_LOCATION", "")
+    with contextlib.suppress(FileNotFoundError):
+        rt_tracker = RegressionTestsTracker(agbenchmark_config.regression_tests_file)
+
+        test_name = request.node.parent.name
+        challenge_location = getattr(request.node.parent.cls, "CHALLENGE_LOCATION", "")
         skip_string = f"Skipping {test_name} at {challenge_location}"
 
         # Check if the test name exists in the regression tests
-        if request.config.getoption("--improve") and data.get(test_name, None):
+        is_regression_test = rt_tracker.has_regression_test(test_name)
+        if request.config.getoption("--improve") and is_regression_test:
             pytest.skip(f"{skip_string} because it's a regression test")
-        elif request.config.getoption("--maintain") and not data.get(test_name, None):
+        elif request.config.getoption("--maintain") and not is_regression_test:
             pytest.skip(f"{skip_string} because it's not a regression test")
 
 
-# this is to get the challenge_data from every test
-@pytest.fixture(autouse=True)
-def challenge_data(request: Any) -> None:
-    """
-    This pytest fixture is responsible for providing the challenge data for each test.
-    It is automatically used in every test due to the 'autouse=True' parameter.
-    The challenge data is retrieved from the request object's parameters.
-    This fixture is essential for the pytest system as it provides the necessary data for each test.
-
-    Args:
-        request (Any): The request object from which the challenge data is retrieved.
-
-    Returns:
-        None: The challenge data is directly passed to the test function and does not need to be returned.
-    """
-    print(f"REQUEST DATA HERE {request}")
-    return request.param
-
-
 @pytest.fixture(autouse=True, scope="session")
-def mock(request: Any) -> None:
+def mock(request: pytest.FixtureRequest) -> bool:
     """
-    This pytest fixture is responsible for retrieving the value of the "--mock" command-line option.
-    It is automatically used in every test session due to the 'autouse=True' parameter and 'session' scope.
-    The "--mock" option is used to run the tests in mock mode.
-    This fixture is essential for the pytest system as it provides the necessary command-line option value for each test session.
+    Pytest fixture that retrieves the value of the `--mock` command-line option.
+    The `--mock` option is used to run the tests in mock mode.
 
     Args:
-        request (Any): The request object from which the "--mock" option value is retrieved.
+        request: The `pytest.FixtureRequest` from which the `--mock` option value
+            is retrieved.
 
     Returns:
-        None: The "--mock" option value is directly passed to the test session and does not need to be returned.
+        bool: Whether `--mock` is set for this session.
     """
     return request.config.getoption("--mock")
 
 
-@pytest.fixture(autouse=True, scope="function")
-def timer(request: Any) -> Any:
-    """
-    This pytest fixture is responsible for timing the execution of each test.
-    It is automatically used in every test due to the 'autouse=True' parameter and 'function' scope.
-    At the start of each test, it records the current time.
-    After the test function completes, it calculates the run time and appends it to the test node's user properties.
-    This allows the run time of each test to be accessed later for reporting or analysis.
-
-    Args:
-        request (Any): The request object from which the test node is retrieved.
-
-    Yields:
-        None: Control is yielded back to the test function.
-    """
-    start_time = time.time()
-    yield
-    run_time = time.time() - start_time
-    request.node.user_properties.append(("run_time", run_time))
+test_reports: dict[str, Test] = {}
 
 
-def pytest_runtest_makereport(item: Any, call: Any) -> None:
+def pytest_runtest_makereport(item: pytest.Item, call: pytest.CallInfo) -> None:
     """
-    This function is a pytest hook that is called when a test report is being generated.
+    Pytest hook that is called when a test report is being generated.
     It is used to generate and finalize reports for each test.
 
     Args:
-        item (Any): The test item for which the report is being generated.
-        call (Any): The call object from which the test result is retrieved.
+        item: The test item for which the report is being generated.
+        call: The call object from which the test result is retrieved.
     """
-    challenge_data = item.funcargs.get("challenge_data", None)
+    challenge: type[BaseChallenge] = item.cls  # type: ignore
+    challenge_id = challenge.info.eval_id
 
-    print(f"pytest_runtest_makereport Challenge data: {challenge_data}")
+    if challenge_id not in test_reports:
+        test_reports[challenge_id] = make_empty_test_report(challenge.info)
 
-    if not challenge_data:
-        # this will only happen for dummy dependency setup tests
-        return
-
-    challenge_location: str = getattr(item.cls, "CHALLENGE_LOCATION", "")
-
-    flags = (
-        "--test" in sys.argv
-        or "--maintain" in sys.argv
-        or "--improve" in sys.argv
-        or "--explore" in sys.argv
-    )
-
-    if call.when == "call":
-        answers = getattr(item, "answers", None)
-        challenge_location: str = getattr(item.cls, "CHALLENGE_LOCATION", "")
+    if call.when == "setup":
         test_name = item.nodeid.split("::")[1]
-        item.test_name = test_name
+        item.user_properties.append(("test_name", test_name))
 
-        generate_single_call_report(
-            item, call, challenge_data, answers, challenge_location, test_name
+    if call.when == "call":
+        add_test_result_to_report(
+            test_reports[challenge_id], item, call, agbenchmark_config
        )
 
-    if call.when == "teardown":
-        finalize_reports(item, challenge_data)
-
 
 def timeout_monitor(start_time: int) -> None:
     """
-    This function is responsible for monitoring the total execution time of the test suite.
-    It runs in a separate thread and checks every second if the total execution time has exceeded the global timeout.
-    If the global timeout is exceeded, it terminates the pytest session with a specific return code.
+    Function that limits the total execution time of the test suite.
+    This function is supposed to be run in a separate thread and calls `pytest.exit`
+    if the total execution time has exceeded the global timeout.
 
     Args:
         start_time (int): The start time of the test suite.
@@ -287,14 +194,11 @@ def timeout_monitor(start_time: int) -> None:
         pytest.exit("Test suite exceeded the global timeout", returncode=1)
 
 
-def pytest_sessionstart(session: Any) -> None:
+def pytest_sessionstart(session: pytest.Session) -> None:
     """
-    This function is a pytest hook that is called at the start of the test session.
-    It starts the timeout monitor in a separate thread.
-    The timeout monitor checks if the total execution time of the test suite has exceeded the global timeout.
+    Pytest hook that is called at the start of a test session.
 
-    Args:
-        session (Any): The pytest session object.
+    Sets up and runs a `timeout_monitor` in a separate thread.
     """
     start_time = time.time()
     t = threading.Thread(target=timeout_monitor, args=(start_time,))
@@ -302,99 +206,129 @@ def pytest_sessionstart(session: Any) -> None:
     t.start()
 
 
-def pytest_sessionfinish(session: Any) -> None:
-    """
-    This function is a pytest hook that is called at the end of the test session.
-    It is used to finalize and save the test reports.
-    The reports are saved in a specific location defined in the suite reports.
-
-    Args:
-        session (Any): The pytest session object.
+def pytest_sessionfinish(session: pytest.Session) -> None:
     """
-    session_finish(suite_reports)
-
+    Pytest hook that is called at the end of a test session.
 
-@pytest.fixture
-def scores(request: Any) -> None:
+    Finalizes and saves the test reports.
     """
-    This pytest fixture is responsible for retrieving the scores of the test class.
-    The scores are retrieved from the test class's 'scores' attribute using the test class name.
-    This fixture is essential for the pytest system as it provides the necessary scores for each test.
+    session_finish(agbenchmark_config)
 
-    Args:
-        request (Any): The request object from which the test class is retrieved.
 
-    Returns:
-        None: The scores are directly passed to the test function and do not need to be returned.
-    """
-    test_class_name = request.node.cls.__name__
-    return request.node.cls.scores.get(test_class_name)
+def pytest_generate_tests(metafunc: pytest.Metafunc):
+    if type(n := metafunc.config.getoption("-N")) is str:
+        metafunc.parametrize("i_attempt", range(int(n)))
 
 
-# this is adding the dependency marker and category markers automatically from the json
-def pytest_collection_modifyitems(items: Any, config: Any) -> None:
+def pytest_collection_modifyitems(
+    items: list[pytest.Item], config: pytest.Config
+) -> None:
     """
-    This function is a pytest hook that is called after the test collection has been performed.
-    It is used to modify the collected test items based on the agent benchmark configuration.
-    The function loads the agent benchmark configuration from the specified path and retrieves the regression reports.
-    For each test item, it checks if the test method exists and retrieves the dependencies and categories from the test class instance.
-    If the "--improve" or "--category" options are used, the dependencies are filtered based on the regression data.
-    If the "--test", "--no_dep", or "--maintain" options are used, the dependencies are cleared.
-    The function then dynamically adds the 'depends' and 'category' markers to the test item.
-    This function is essential for the pytest system as it provides the necessary modification of the test items based on the agent benchmark configuration.
+    Pytest hook that is called after initial test collection has been performed.
+    Modifies the collected test items based on the agent benchmark configuration,
+    adding the dependency marker and category markers.
 
     Args:
-        items (Any): The collected test items to be modified.
-        config (Any): The pytest configuration object from which the agent benchmark configuration path is retrieved.
+        items: The collected test items to be modified.
+        config: The active pytest configuration.
     """
-    agent_benchmark_config_path = str(Path.cwd() / "agbenchmark_config" / "config.json")
-    try:
-        with open(agent_benchmark_config_path) as f:
-            agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
-    except json.JSONDecodeError:
-        print("Error: benchmark_config.json is not a valid JSON file.")
-        raise
+    rt_tracker = RegressionTestsTracker(agbenchmark_config.regression_tests_file)
 
-    regression_file = agent_benchmark_config.get_regression_reports_path()
-    data = (
-        json.loads(open(regression_file, "r").read())
-        if os.path.exists(regression_file)
-        else {}
-    )
-
-    print(f"data??? {data}")
+    try:
+        challenges_beaten_in_the_past = json.loads(
+            agbenchmark_config.challenges_already_beaten_file.read_bytes()
+        )
+    except FileNotFoundError:
+        challenges_beaten_in_the_past = {}
+
+    selected_tests: tuple[str] = config.getoption("--test")  # type: ignore
+    selected_categories: tuple[str] = config.getoption("--category")  # type: ignore
+
+    # Can't use a for-loop to remove items in-place
+    i = 0
+    while i < len(items):
+        item = items[i]
+        challenge = item.cls
+        challenge_name = item.cls.__name__
+
+        if not issubclass(challenge, BaseChallenge):
+            item.warn(
+                pytest.PytestCollectionWarning(
+                    f"Non-challenge item collected: {challenge}"
+                )
+            )
+            i += 1
+            continue
 
-    for item in items:
-        # Assuming item.cls is your test class
-        test_class_instance = item.cls()
+        # --test: remove the test from the set if it's not specifically selected
+        if selected_tests and challenge.info.name not in selected_tests:
+            items.remove(item)
+            continue
 
-        print(f"item: {item!r}")
+        # Filter challenges for --maintain, --improve, and --explore:
+        #   --maintain -> only challenges expected to be passed (= regression tests)
+        #   --improve -> only challenges that so far are not passed (reliably)
+        #   --explore -> only challenges that have never been passed
+        is_regression_test = rt_tracker.has_regression_test(challenge.info.name)
+        has_been_passed = challenges_beaten_in_the_past.get(challenge.info.name, False)
+        if (
+            (config.getoption("--maintain") and not is_regression_test)
+            or (config.getoption("--improve") and is_regression_test)
+            or (config.getoption("--explore") and has_been_passed)
+        ):
+            items.remove(item)
+            continue
 
-        if "test_method" not in item.name:
+        dependencies = challenge.info.dependencies
+        if (
+            config.getoption("--test")
+            or config.getoption("--no-dep")
+            or config.getoption("--maintain")
+        ):
+            # Ignore dependencies:
+            #   --test -> user selected specific tests to run, don't care about deps
+            #   --no-dep -> ignore dependency relations regardless of test selection
+            #   --maintain -> all "regression" tests must pass, so run all of them
+            dependencies = []
+        elif config.getoption("--improve"):
+            # Filter dependencies, keep only deps that are not "regression" tests
+            dependencies = [
+                d for d in dependencies if not rt_tracker.has_regression_test(d)
+            ]
+
+        # Set category markers
+        challenge_categories = set(c.value for c in challenge.info.category)
+        for category in challenge_categories:
+            item.add_marker(category)
+
+        # Enforce category selection
+        if selected_categories:
+            if not challenge_categories.intersection(set(selected_categories)):
+                items.remove(item)
+                continue
+            # # Filter dependencies, keep only deps from selected categories
+            # dependencies = [
+            #     d for d in dependencies
+            #     if not set(d.categories).intersection(set(selected_categories))
+            # ]
+
+        # Skip items in optional categories that are not selected for the subject agent
+        challenge_optional_categories = challenge_categories & set(OPTIONAL_CATEGORIES)
+        if challenge_optional_categories and not (
+            agbenchmark_config.categories
+            and challenge_optional_categories.issubset(
+                set(agbenchmark_config.categories)
+            )
+        ):
+            logger.debug(
+                f"Skipping {challenge_name}: "
+                f"category {' and '.join(challenge_optional_categories)} is optional, "
+                "and not explicitly selected in the benchmark config."
+            )
+            items.remove(item)
            continue
 
-        # Then you can access your properties
-        name = item.parent.cls.__name__
-        # dependencies = test_class_instance.data.dependencies
-
-        # Filter dependencies if they exist in regression data if its an improvement test
-        # if config.getoption("--improve") or config.getoption(
-        #     "--category"
-        # ):
-        #     dependencies = [dep for dep in dependencies if not data.get(dep, None)]
-        # if (
-        #     config.getoption("--test")
-        #     or config.getoption("--no_dep")
-        #     or config.getoption("--maintain")
-        # ):
-        print(f"test_class_instance: {test_class_instance!r}")
-        dependencies = test_class_instance.dependencies
-
-        # Add depends marker dynamically
-        item.add_marker(pytest.mark.depends(on=dependencies, name=name))
-
-        categories = test_class_instance.data.category
-
-        # Add category marker dynamically
-        for category in categories:
-            item.add_marker(getattr(pytest.mark, category))
+        # Add marker for the DependencyManager
+        item.add_marker(pytest.mark.depends(on=dependencies, name=challenge_name))
+
+        i += 1
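
For context on the new `-N`/`--attempts` option introduced by this change: `pytest_addoption` registers it, and `pytest_generate_tests` parametrizes an `i_attempt` argument over `range(N)`, so every challenge test that accepts `i_attempt` is collected once per attempt. Below is a minimal, self-contained sketch of the same pattern; the `i_attempt` fixture-name check, the single-attempt fallback, and the example test are illustrative additions, not part of the diff.

    # conftest.py -- illustrative sketch of the -N / i_attempt pattern
    import pytest

    def pytest_addoption(parser: pytest.Parser) -> None:
        parser.addoption("-N", "--attempts", action="store")

    def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
        # Parametrize only tests that take an `i_attempt` argument;
        # fall back to a single attempt when -N is not given.
        if "i_attempt" in metafunc.fixturenames:
            n = metafunc.config.getoption("-N")
            metafunc.parametrize("i_attempt", range(int(n)) if n else [0])

    # test_example.py
    def test_challenge(i_attempt: int) -> None:
        # With `pytest -N 3`, this test is collected as attempts 0, 1 and 2.
        assert i_attempt >= 0

Run as `pytest -N 3`, the example test is collected three times, which is the mechanism the benchmark uses to retry a challenge multiple times within one session.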