Diffstat (limited to 'benchmark/agbenchmark/__main__.py')
-rw-r--r-- | benchmark/agbenchmark/__main__.py | 351
1 file changed, 140 insertions, 211 deletions
diff --git a/benchmark/agbenchmark/__main__.py b/benchmark/agbenchmark/__main__.py
index 76ca7529a..9fff53523 100644
--- a/benchmark/agbenchmark/__main__.py
+++ b/benchmark/agbenchmark/__main__.py
@@ -1,5 +1,4 @@
-import glob
-import json
+import logging
 import os
 import sys
 from datetime import datetime, timezone
@@ -7,205 +6,100 @@ from pathlib import Path
 from typing import Any, Optional

 import click
-import pytest
-import toml
+from click_default_group import DefaultGroup
 from dotenv import load_dotenv
-from helicone.lock import HeliconeLockManager

-from agbenchmark.app import app
-from agbenchmark.reports.ReportManager import SingletonReportManager
-from agbenchmark.utils.data_types import AgentBenchmarkConfig
+from agbenchmark.config import AgentBenchmarkConfig
+from agbenchmark.utils.logging import configure_logging

 load_dotenv()

+# try:
+#     if os.getenv("HELICONE_API_KEY"):
+#         import helicone  # noqa
+
+#         helicone_enabled = True
+#     else:
+#         helicone_enabled = False
+# except ImportError:
+#     helicone_enabled = False
+
+
+class InvalidInvocationError(ValueError):
+    pass
+
+
+logger = logging.getLogger(__name__)
+
 BENCHMARK_START_TIME_DT = datetime.now(timezone.utc)
 BENCHMARK_START_TIME = BENCHMARK_START_TIME_DT.strftime("%Y-%m-%dT%H:%M:%S+00:00")
-TEMP_FOLDER_ABS_PATH = Path.cwd() / "agbenchmark_config" / "temp_folder"
-CHALLENGES_ALREADY_BEATEN = (
-    Path.cwd() / "agbenchmark_config" / "challenges_already_beaten.json"
-)
-UPDATES_JSON_PATH = Path.cwd() / "agbenchmark_config" / "updates.json"

-if os.environ.get("HELICONE_API_KEY"):
-    HeliconeLockManager.write_custom_property(
-        "benchmark_start_time", BENCHMARK_START_TIME
-    )
+# if helicone_enabled:
+#     from helicone.lock import HeliconeLockManager

-with open(
-    Path(__file__).resolve().parent / "challenges" / "optional_categories.json"
-) as f:
-    OPTIONAL_CATEGORIES = json.load(f)["optional_categories"]
-
-
-def get_unique_categories() -> set[str]:
-    """Find all data.json files in the directory relative to this file and its subdirectories,
-    read the "category" field from each file, and return a set of unique categories."""
-    categories = set()
-
-    # Get the directory of this file
-    this_dir = os.path.dirname(os.path.abspath(__file__))
-
-    glob_path = os.path.join(this_dir, "./challenges/**/data.json")
-    # Use it as the base for the glob pattern
-    for data_file in glob.glob(glob_path, recursive=True):
-        with open(data_file, "r") as f:
-            try:
-                data = json.load(f)
-                categories.update(data.get("category", []))
-            except json.JSONDecodeError:
-                print(f"Error: {data_file} is not a valid JSON file.")
-                continue
-            except IOError:
-                print(f"IOError: file could not be read: {data_file}")
-                continue
-
-    return categories
-
-
-def run_benchmark(
-    maintain: bool = False,
-    improve: bool = False,
-    explore: bool = False,
-    mock: bool = False,
-    no_dep: bool = False,
-    nc: bool = False,
-    keep_answers: bool = False,
-    category: Optional[tuple[str]] = None,
-    skip_category: Optional[tuple[str]] = None,
-    test: Optional[str] = None,
-    cutoff: Optional[int] = None,
-    server: bool = False,
-) -> int:
-    """Start the benchmark tests.
-    If a category flag is provided, run the categories with that mark."""
-    # Check if configuration file exists and is not empty
-
-    initialize_updates_file()
-    SingletonReportManager()
-    agent_benchmark_config_path = str(Path.cwd() / "agbenchmark_config" / "config.json")
-    try:
-        with open(agent_benchmark_config_path, "r") as f:
-            agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
-            agent_benchmark_config.agent_benchmark_config_path = (
-                agent_benchmark_config_path
-            )
-    except json.JSONDecodeError:
-        print("Error: benchmark_config.json is not a valid JSON file.")
-        return 1
+# HeliconeLockManager.write_custom_property(
+#     "benchmark_start_time", BENCHMARK_START_TIME
+# )

-    if maintain and improve and explore:
-        print(
-            "Error: You can't use --maintain, --improve or --explore at the same time. Please choose one."
-        )
-        return 1

-    if test and (category or skip_category or maintain or improve or explore):
-        print(
-            "Error: If you're running a specific test make sure no other options are selected. Please just pass the --test."
-        )
-        return 1
-
-    assert agent_benchmark_config.host, "Error: host needs to be added to the config."
+@click.group(cls=DefaultGroup, default_if_no_args=True)
+@click.option("--debug", is_flag=True, help="Enable debug output")
+def cli(
+    debug: bool,
+) -> Any:
+    configure_logging(logging.DEBUG if debug else logging.INFO)

-    print("Current configuration:")
-    for key, value in vars(agent_benchmark_config).items():
-        print(f"{key}: {value}")

-    pytest_args = ["-vs"]
-    if keep_answers:
-        pytest_args.append("--keep-answers")
+@cli.command(hidden=True)
+def start():
+    raise DeprecationWarning(
+        "`agbenchmark start` is deprecated. Use `agbenchmark run` instead."
+    )

-    if test:
-        print("Running specific test:", test)
-    else:
-        # Categories that are used in the challenges
-        categories = get_unique_categories()
-        if category:
-            invalid_categories = set(category) - categories
-            assert (
-                not invalid_categories
-            ), f"Invalid categories: {invalid_categories}. Valid categories are: {categories}"
-
-        if category:
-            categories_to_run = set(category)
-            if skip_category:
-                categories_to_run = categories_to_run.difference(set(skip_category))
-                assert categories_to_run, "Error: You can't skip all categories"
-            pytest_args.extend(["-m", " or ".join(categories_to_run), "--category"])
-            print("Running tests of category:", categories_to_run)
-        elif skip_category:
-            categories_to_run = categories - set(skip_category)
-            assert categories_to_run, "Error: You can't skip all categories"
-            pytest_args.extend(["-m", " or ".join(categories_to_run), "--category"])
-            print("Running tests of category:", categories_to_run)
-        else:
-            print("Running all categories")
-
-        if maintain:
-            print("Running only regression tests")
-            pytest_args.append("--maintain")
-        elif improve:
-            print("Running only non-regression tests")
-            pytest_args.append("--improve")
-        elif explore:
-            print("Only attempt challenges that have never been beaten")
-            pytest_args.append("--explore")
-
-    if mock:
-        pytest_args.append("--mock")
-        os.environ[
-            "IS_MOCK"
-        ] = "True"  # ugly hack to make the mock work when calling from API
-
-    if no_dep:
-        pytest_args.append("--no_dep")
-
-    if nc and cutoff:
-        print(
-            "Error: You can't use both --nc and --cutoff at the same time. Please choose one."
-        )
-        return 1
-
-    if nc:
-        pytest_args.append("--nc")
-    if cutoff:
-        pytest_args.append("--cutoff")
-        print(f"Setting cuttoff override to {cutoff} seconds.")
-    current_dir = Path(__file__).resolve().parent
-    print(f"Current directory: {current_dir}")
-    pytest_args.extend((str(current_dir), "--cache-clear"))
-    exit_code = pytest.main(pytest_args)
-    SingletonReportManager().clear_instance()
-
-
-@click.group(invoke_without_command=True)
-@click.option("--backend", is_flag=True, help="If it's being run from the cli")
-@click.option("-c", "--category", multiple=True, help="Specific category to run")
+@cli.command(default=True)
+@click.option(
+    "-N", "--attempts", default=1, help="Number of times to run each challenge."
+)
+@click.option(
+    "-c",
+    "--category",
+    multiple=True,
+    help="(+) Select a category to run.",
+)
 @click.option(
     "-s",
     "--skip-category",
     multiple=True,
-    help="Skips preventing the tests from this category from running",
+    help="(+) Exclude a category from running.",
 )
-@click.option("--test", multiple=True, help="Specific test to run")
-@click.option("--maintain", is_flag=True, help="Runs only regression tests")
-@click.option("--improve", is_flag=True, help="Run only non-regression tests")
+@click.option("--test", multiple=True, help="(+) Select a test to run.")
+@click.option("--maintain", is_flag=True, help="Run only regression tests.")
+@click.option("--improve", is_flag=True, help="Run only non-regression tests.")
 @click.option(
     "--explore",
     is_flag=True,
-    help="Only attempt challenges that have never been beaten",
+    help="Run only challenges that have never been beaten.",
 )
-@click.option("--mock", is_flag=True, help="Run with mock")
 @click.option(
-    "--no_dep",
+    "--no-dep",
     is_flag=True,
-    help="Run without dependencies",
+    help="Run all (selected) challenges, regardless of dependency success/failure.",
 )
-@click.option("--nc", is_flag=True, help="Run without cutoff")
+@click.option("--cutoff", type=int, help="Override the challenge time limit (seconds).")
+@click.option("--nc", is_flag=True, help="Disable the challenge time limit.")
+@click.option("--mock", is_flag=True, help="Run with mock")
 @click.option("--keep-answers", is_flag=True, help="Keep answers")
-@click.option("--cutoff", help="Set or override tests cutoff (seconds)")
-@click.argument("value", type=str, required=False)
-def cli(
+@click.option(
+    "--backend",
+    is_flag=True,
+    help="Write log output to a file instead of the terminal.",
+)
+# @click.argument(
+#     "agent_path", type=click.Path(exists=True, file_okay=False), required=False
+# )
+def run(
     maintain: bool,
     improve: bool,
     explore: bool,
@@ -213,18 +107,38 @@ def cli(
     no_dep: bool,
     nc: bool,
     keep_answers: bool,
-    category: Optional[list[str]] = None,
-    skip_category: Optional[list[str]] = None,
-    test: Optional[str] = None,
+    test: tuple[str],
+    category: tuple[str],
+    skip_category: tuple[str],
+    attempts: int,
     cutoff: Optional[int] = None,
     backend: Optional[bool] = False,
-    value: Optional[str] = None,
-) -> Any:
-    # Redirect stdout if backend is True
-    if value == "start":
-        raise ("`agbenchmark start` is removed. Run `agbenchmark` instead.")
-    if value == "serve":
-        return serve()
+    # agent_path: Optional[Path] = None,
+) -> None:
+    """
+    Run the benchmark on the agent in the current directory.
+
+    Options marked with (+) can be specified multiple times, to select multiple items.
+    """
+    from agbenchmark.main import run_benchmark, validate_args
+
+    agbenchmark_config = AgentBenchmarkConfig.load()
+    logger.debug(f"agbenchmark_config: {agbenchmark_config.agbenchmark_config_dir}")
+    try:
+        validate_args(
+            maintain=maintain,
+            improve=improve,
+            explore=explore,
+            tests=test,
+            categories=category,
+            skip_categories=skip_category,
+            no_cutoff=nc,
+            cutoff=cutoff,
+        )
+    except InvalidInvocationError as e:
+        logger.error("Error: " + "\n".join(e.args))
+        sys.exit(1)
+
     original_stdout = sys.stdout  # Save the original standard output
     exit_code = None
@@ -232,16 +146,18 @@
         with open("backend/backend_stdout.txt", "w") as f:
             sys.stdout = f
             exit_code = run_benchmark(
+                config=agbenchmark_config,
                 maintain=maintain,
                 improve=improve,
                 explore=explore,
                 mock=mock,
                 no_dep=no_dep,
-                nc=nc,
+                no_cutoff=nc,
                 keep_answers=keep_answers,
-                category=category,
-                skip_category=skip_category,
-                test=test,
+                tests=test,
+                categories=category,
+                skip_categories=skip_category,
+                attempts_per_challenge=attempts,
                 cutoff=cutoff,
             )

@@ -249,16 +165,18 @@
     else:
         exit_code = run_benchmark(
+            config=agbenchmark_config,
             maintain=maintain,
             improve=improve,
             explore=explore,
             mock=mock,
             no_dep=no_dep,
-            nc=nc,
+            no_cutoff=nc,
             keep_answers=keep_answers,
-            category=category,
-            skip_category=skip_category,
-            test=test,
+            tests=test,
+            categories=category,
+            skip_categories=skip_category,
+            attempts_per_challenge=attempts,
             cutoff=cutoff,
         )

@@ -266,33 +184,44 @@
 @cli.command()
-def version():
-    """Print the version of the benchmark tool."""
-    current_directory = Path(__file__).resolve().parent
-    version = toml.load(current_directory / ".." / "pyproject.toml")["tool"]["poetry"][
-        "version"
-    ]
-    print(f"Benchmark Tool Version {version}")
+@click.option("--port", type=int, help="Port to run the API on.")
+def serve(port: Optional[int] = None):
+    """Serve the benchmark frontend and API on port 8080."""
+    import uvicorn
+    from agbenchmark.app import setup_fastapi_app

-def serve():
-    import uvicorn
+    config = AgentBenchmarkConfig.load()
+    app = setup_fastapi_app(config)

     # Run the FastAPI application using uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8080)
+    port = port or int(os.getenv("PORT", 8080))
+    uvicorn.run(app, host="0.0.0.0", port=port)

-def initialize_updates_file():
-    if os.path.exists(UPDATES_JSON_PATH):
-        # If the file already exists, overwrite it with an empty list
-        with open(UPDATES_JSON_PATH, "w") as file:
-            json.dump([], file, indent=2)
-        print("Initialized updates.json by overwriting with an empty array")
-    else:
-        # If the file doesn't exist, create it and write an empty list
-        with open(UPDATES_JSON_PATH, "w") as file:
-            json.dump([], file, indent=2)
-        print("Created updates.json and initialized it with an empty array")
+@cli.command()
+def config():
+    """Displays info regarding the present AGBenchmark config."""
+    try:
+        config = AgentBenchmarkConfig.load()
+    except FileNotFoundError as e:
+        click.echo(e, err=True)
+        return 1
+
+    k_col_width = max(len(k) for k in config.dict().keys())
+    for k, v in config.dict().items():
+        click.echo(f"{k: <{k_col_width}} = {v}")
+
+
+@cli.command()
+def version():
+    """Print version info for the AGBenchmark application."""
+    import toml
+
+    package_root = Path(__file__).resolve().parent.parent
+    pyproject = toml.load(package_root / "pyproject.toml")
+    version = pyproject["tool"]["poetry"]["version"]
+    click.echo(f"AGBenchmark version {version}")

 if __name__ == "__main__":