Diffstat (limited to 'benchmark/agbenchmark/__main__.py')
-rw-r--r--   benchmark/agbenchmark/__main__.py   351
1 file changed, 140 insertions(+), 211 deletions(-)
diff --git a/benchmark/agbenchmark/__main__.py b/benchmark/agbenchmark/__main__.py
index 76ca7529a..9fff53523 100644
--- a/benchmark/agbenchmark/__main__.py
+++ b/benchmark/agbenchmark/__main__.py
@@ -1,5 +1,4 @@
-import glob
-import json
+import logging
import os
import sys
from datetime import datetime, timezone
@@ -7,205 +6,100 @@ from pathlib import Path
from typing import Any, Optional
import click
-import pytest
-import toml
+from click_default_group import DefaultGroup
from dotenv import load_dotenv
-from helicone.lock import HeliconeLockManager
-from agbenchmark.app import app
-from agbenchmark.reports.ReportManager import SingletonReportManager
-from agbenchmark.utils.data_types import AgentBenchmarkConfig
+from agbenchmark.config import AgentBenchmarkConfig
+from agbenchmark.utils.logging import configure_logging
load_dotenv()
+# try:
+# if os.getenv("HELICONE_API_KEY"):
+# import helicone # noqa
+
+# helicone_enabled = True
+# else:
+# helicone_enabled = False
+# except ImportError:
+# helicone_enabled = False
+
+
+class InvalidInvocationError(ValueError):
+ pass
+
+
+logger = logging.getLogger(__name__)
+
BENCHMARK_START_TIME_DT = datetime.now(timezone.utc)
BENCHMARK_START_TIME = BENCHMARK_START_TIME_DT.strftime("%Y-%m-%dT%H:%M:%S+00:00")
-TEMP_FOLDER_ABS_PATH = Path.cwd() / "agbenchmark_config" / "temp_folder"
-CHALLENGES_ALREADY_BEATEN = (
- Path.cwd() / "agbenchmark_config" / "challenges_already_beaten.json"
-)
-UPDATES_JSON_PATH = Path.cwd() / "agbenchmark_config" / "updates.json"
-if os.environ.get("HELICONE_API_KEY"):
- HeliconeLockManager.write_custom_property(
- "benchmark_start_time", BENCHMARK_START_TIME
- )
+# if helicone_enabled:
+# from helicone.lock import HeliconeLockManager
-with open(
- Path(__file__).resolve().parent / "challenges" / "optional_categories.json"
-) as f:
- OPTIONAL_CATEGORIES = json.load(f)["optional_categories"]
-
-
-def get_unique_categories() -> set[str]:
- """Find all data.json files in the directory relative to this file and its subdirectories,
- read the "category" field from each file, and return a set of unique categories."""
- categories = set()
-
- # Get the directory of this file
- this_dir = os.path.dirname(os.path.abspath(__file__))
-
- glob_path = os.path.join(this_dir, "./challenges/**/data.json")
- # Use it as the base for the glob pattern
- for data_file in glob.glob(glob_path, recursive=True):
- with open(data_file, "r") as f:
- try:
- data = json.load(f)
- categories.update(data.get("category", []))
- except json.JSONDecodeError:
- print(f"Error: {data_file} is not a valid JSON file.")
- continue
- except IOError:
- print(f"IOError: file could not be read: {data_file}")
- continue
-
- return categories
-
-
-def run_benchmark(
- maintain: bool = False,
- improve: bool = False,
- explore: bool = False,
- mock: bool = False,
- no_dep: bool = False,
- nc: bool = False,
- keep_answers: bool = False,
- category: Optional[tuple[str]] = None,
- skip_category: Optional[tuple[str]] = None,
- test: Optional[str] = None,
- cutoff: Optional[int] = None,
- server: bool = False,
-) -> int:
- """Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
- # Check if configuration file exists and is not empty
-
- initialize_updates_file()
- SingletonReportManager()
- agent_benchmark_config_path = str(Path.cwd() / "agbenchmark_config" / "config.json")
- try:
- with open(agent_benchmark_config_path, "r") as f:
- agent_benchmark_config = AgentBenchmarkConfig(**json.load(f))
- agent_benchmark_config.agent_benchmark_config_path = (
- agent_benchmark_config_path
- )
- except json.JSONDecodeError:
- print("Error: benchmark_config.json is not a valid JSON file.")
- return 1
+# HeliconeLockManager.write_custom_property(
+# "benchmark_start_time", BENCHMARK_START_TIME
+# )
- if maintain and improve and explore:
- print(
- "Error: You can't use --maintain, --improve or --explore at the same time. Please choose one."
- )
- return 1
- if test and (category or skip_category or maintain or improve or explore):
- print(
- "Error: If you're running a specific test make sure no other options are selected. Please just pass the --test."
- )
- return 1
-
- assert agent_benchmark_config.host, "Error: host needs to be added to the config."
+@click.group(cls=DefaultGroup, default_if_no_args=True)
+@click.option("--debug", is_flag=True, help="Enable debug output")
+def cli(
+ debug: bool,
+) -> Any:
+ configure_logging(logging.DEBUG if debug else logging.INFO)
- print("Current configuration:")
- for key, value in vars(agent_benchmark_config).items():
- print(f"{key}: {value}")
- pytest_args = ["-vs"]
- if keep_answers:
- pytest_args.append("--keep-answers")
+@cli.command(hidden=True)
+def start():
+ raise DeprecationWarning(
+ "`agbenchmark start` is deprecated. Use `agbenchmark run` instead."
+ )
- if test:
- print("Running specific test:", test)
- else:
- # Categories that are used in the challenges
- categories = get_unique_categories()
- if category:
- invalid_categories = set(category) - categories
- assert (
- not invalid_categories
- ), f"Invalid categories: {invalid_categories}. Valid categories are: {categories}"
-
- if category:
- categories_to_run = set(category)
- if skip_category:
- categories_to_run = categories_to_run.difference(set(skip_category))
- assert categories_to_run, "Error: You can't skip all categories"
- pytest_args.extend(["-m", " or ".join(categories_to_run), "--category"])
- print("Running tests of category:", categories_to_run)
- elif skip_category:
- categories_to_run = categories - set(skip_category)
- assert categories_to_run, "Error: You can't skip all categories"
- pytest_args.extend(["-m", " or ".join(categories_to_run), "--category"])
- print("Running tests of category:", categories_to_run)
- else:
- print("Running all categories")
-
- if maintain:
- print("Running only regression tests")
- pytest_args.append("--maintain")
- elif improve:
- print("Running only non-regression tests")
- pytest_args.append("--improve")
- elif explore:
- print("Only attempt challenges that have never been beaten")
- pytest_args.append("--explore")
-
- if mock:
- pytest_args.append("--mock")
- os.environ[
- "IS_MOCK"
- ] = "True" # ugly hack to make the mock work when calling from API
-
- if no_dep:
- pytest_args.append("--no_dep")
-
- if nc and cutoff:
- print(
- "Error: You can't use both --nc and --cutoff at the same time. Please choose one."
- )
- return 1
- if nc:
- pytest_args.append("--nc")
- if cutoff:
- pytest_args.append("--cutoff")
- print(f"Setting cuttoff override to {cutoff} seconds.")
- current_dir = Path(__file__).resolve().parent
- print(f"Current directory: {current_dir}")
- pytest_args.extend((str(current_dir), "--cache-clear"))
- exit_code = pytest.main(pytest_args)
- SingletonReportManager().clear_instance()
-
-
-@click.group(invoke_without_command=True)
-@click.option("--backend", is_flag=True, help="If it's being run from the cli")
-@click.option("-c", "--category", multiple=True, help="Specific category to run")
+@cli.command(default=True)
+@click.option(
+ "-N", "--attempts", default=1, help="Number of times to run each challenge."
+)
+@click.option(
+ "-c",
+ "--category",
+ multiple=True,
+ help="(+) Select a category to run.",
+)
@click.option(
"-s",
"--skip-category",
multiple=True,
- help="Skips preventing the tests from this category from running",
+ help="(+) Exclude a category from running.",
)
-@click.option("--test", multiple=True, help="Specific test to run")
-@click.option("--maintain", is_flag=True, help="Runs only regression tests")
-@click.option("--improve", is_flag=True, help="Run only non-regression tests")
+@click.option("--test", multiple=True, help="(+) Select a test to run.")
+@click.option("--maintain", is_flag=True, help="Run only regression tests.")
+@click.option("--improve", is_flag=True, help="Run only non-regression tests.")
@click.option(
"--explore",
is_flag=True,
- help="Only attempt challenges that have never been beaten",
+ help="Run only challenges that have never been beaten.",
)
-@click.option("--mock", is_flag=True, help="Run with mock")
@click.option(
- "--no_dep",
+ "--no-dep",
is_flag=True,
- help="Run without dependencies",
+ help="Run all (selected) challenges, regardless of dependency success/failure.",
)
-@click.option("--nc", is_flag=True, help="Run without cutoff")
+@click.option("--cutoff", type=int, help="Override the challenge time limit (seconds).")
+@click.option("--nc", is_flag=True, help="Disable the challenge time limit.")
+@click.option("--mock", is_flag=True, help="Run with mock")
@click.option("--keep-answers", is_flag=True, help="Keep answers")
-@click.option("--cutoff", help="Set or override tests cutoff (seconds)")
-@click.argument("value", type=str, required=False)
-def cli(
+@click.option(
+ "--backend",
+ is_flag=True,
+ help="Write log output to a file instead of the terminal.",
+)
+# @click.argument(
+# "agent_path", type=click.Path(exists=True, file_okay=False), required=False
+# )
+def run(
maintain: bool,
improve: bool,
explore: bool,
@@ -213,18 +107,38 @@ def cli(
no_dep: bool,
nc: bool,
keep_answers: bool,
- category: Optional[list[str]] = None,
- skip_category: Optional[list[str]] = None,
- test: Optional[str] = None,
+ test: tuple[str],
+ category: tuple[str],
+ skip_category: tuple[str],
+ attempts: int,
cutoff: Optional[int] = None,
backend: Optional[bool] = False,
- value: Optional[str] = None,
-) -> Any:
- # Redirect stdout if backend is True
- if value == "start":
- raise ("`agbenchmark start` is removed. Run `agbenchmark` instead.")
- if value == "serve":
- return serve()
+ # agent_path: Optional[Path] = None,
+) -> None:
+ """
+ Run the benchmark on the agent in the current directory.
+
+ Options marked with (+) can be specified multiple times, to select multiple items.
+ """
+ from agbenchmark.main import run_benchmark, validate_args
+
+ agbenchmark_config = AgentBenchmarkConfig.load()
+ logger.debug(f"agbenchmark_config: {agbenchmark_config.agbenchmark_config_dir}")
+ try:
+ validate_args(
+ maintain=maintain,
+ improve=improve,
+ explore=explore,
+ tests=test,
+ categories=category,
+ skip_categories=skip_category,
+ no_cutoff=nc,
+ cutoff=cutoff,
+ )
+ except InvalidInvocationError as e:
+ logger.error("Error: " + "\n".join(e.args))
+ sys.exit(1)
+
original_stdout = sys.stdout # Save the original standard output
exit_code = None
@@ -232,16 +146,18 @@ def cli(
with open("backend/backend_stdout.txt", "w") as f:
sys.stdout = f
exit_code = run_benchmark(
+ config=agbenchmark_config,
maintain=maintain,
improve=improve,
explore=explore,
mock=mock,
no_dep=no_dep,
- nc=nc,
+ no_cutoff=nc,
keep_answers=keep_answers,
- category=category,
- skip_category=skip_category,
- test=test,
+ tests=test,
+ categories=category,
+ skip_categories=skip_category,
+ attempts_per_challenge=attempts,
cutoff=cutoff,
)
@@ -249,16 +165,18 @@ def cli(
else:
exit_code = run_benchmark(
+ config=agbenchmark_config,
maintain=maintain,
improve=improve,
explore=explore,
mock=mock,
no_dep=no_dep,
- nc=nc,
+ no_cutoff=nc,
keep_answers=keep_answers,
- category=category,
- skip_category=skip_category,
- test=test,
+ tests=test,
+ categories=category,
+ skip_categories=skip_category,
+ attempts_per_challenge=attempts,
cutoff=cutoff,
)
@@ -266,33 +184,44 @@ def cli(
@cli.command()
-def version():
- """Print the version of the benchmark tool."""
- current_directory = Path(__file__).resolve().parent
- version = toml.load(current_directory / ".." / "pyproject.toml")["tool"]["poetry"][
- "version"
- ]
- print(f"Benchmark Tool Version {version}")
+@click.option("--port", type=int, help="Port to run the API on.")
+def serve(port: Optional[int] = None):
+ """Serve the benchmark frontend and API on port 8080."""
+ import uvicorn
+ from agbenchmark.app import setup_fastapi_app
-def serve():
- import uvicorn
+ config = AgentBenchmarkConfig.load()
+ app = setup_fastapi_app(config)
# Run the FastAPI application using uvicorn
- uvicorn.run(app, host="0.0.0.0", port=8080)
+ port = port or int(os.getenv("PORT", 8080))
+ uvicorn.run(app, host="0.0.0.0", port=port)
-def initialize_updates_file():
- if os.path.exists(UPDATES_JSON_PATH):
- # If the file already exists, overwrite it with an empty list
- with open(UPDATES_JSON_PATH, "w") as file:
- json.dump([], file, indent=2)
- print("Initialized updates.json by overwriting with an empty array")
- else:
- # If the file doesn't exist, create it and write an empty list
- with open(UPDATES_JSON_PATH, "w") as file:
- json.dump([], file, indent=2)
- print("Created updates.json and initialized it with an empty array")
+@cli.command()
+def config():
+ """Displays info regarding the present AGBenchmark config."""
+ try:
+ config = AgentBenchmarkConfig.load()
+ except FileNotFoundError as e:
+ click.echo(e, err=True)
+ return 1
+
+ k_col_width = max(len(k) for k in config.dict().keys())
+ for k, v in config.dict().items():
+ click.echo(f"{k: <{k_col_width}} = {v}")
+
+
+@cli.command()
+def version():
+ """Print version info for the AGBenchmark application."""
+ import toml
+
+ package_root = Path(__file__).resolve().parent.parent
+ pyproject = toml.load(package_root / "pyproject.toml")
+ version = pyproject["tool"]["poetry"]["version"]
+ click.echo(f"AGBenchmark version {version}")
if __name__ == "__main__":
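
Note: validate_args and run_benchmark are imported from agbenchmark.main, which is not part of
this diff. The sketch below is a hypothetical reconstruction of validate_args, inferred from the
argument checks that this commit removes from the old run_benchmark() above; the exact signature
and the import location of InvalidInvocationError are assumptions, not taken from the commit.

    from typing import Optional, Sequence

    from agbenchmark.__main__ import InvalidInvocationError  # assumed import path


    def validate_args(
        *,
        maintain: bool,
        improve: bool,
        explore: bool,
        tests: Sequence[str],
        categories: Sequence[str],
        skip_categories: Sequence[str],
        no_cutoff: bool,
        cutoff: Optional[int],
    ) -> None:
        # --maintain, --improve and --explore select disjoint challenge subsets.
        # (The old check only fired when all three flags were set; this sketch
        # rejects any combination of two or more, matching the error message.)
        if sum([maintain, improve, explore]) > 1:
            raise InvalidInvocationError(
                "You can't use --maintain, --improve or --explore at the same time. "
                "Please choose one."
            )
        # --test is an explicit selection and conflicts with every other selector.
        if tests and (categories or skip_categories or maintain or improve or explore):
            raise InvalidInvocationError(
                "If you're running a specific test make sure no other options are "
                "selected. Please just pass the --test."
            )
        # --nc disables the time limit, so overriding it with --cutoff is contradictory.
        if no_cutoff and cutoff:
            raise InvalidInvocationError(
                "You can't use both --nc and --cutoff at the same time. "
                "Please choose one."
            )

For reference, the CLI surface added by this commit (DefaultGroup with "run" as the default
command) can be exercised along these lines:

    agbenchmark run -c coding --nc              # run one category with no time limit
    agbenchmark --debug run --test <test_name> -N 3
    agbenchmark serve --port 8080
    agbenchmark config
    agbenchmark version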