diff options
Diffstat (limited to 'benchmark/agbenchmark/utils/data_types.py')
-rw-r--r-- | benchmark/agbenchmark/utils/data_types.py | 152 |
1 files changed, 3 insertions, 149 deletions
diff --git a/benchmark/agbenchmark/utils/data_types.py b/benchmark/agbenchmark/utils/data_types.py index 955b1d6a8..b38e5ef23 100644 --- a/benchmark/agbenchmark/utils/data_types.py +++ b/benchmark/agbenchmark/utils/data_types.py @@ -1,12 +1,8 @@ -import datetime -import json -import sys -from datetime import datetime from enum import Enum from pathlib import Path from typing import Any, Dict, List, Optional -from pydantic import BaseModel, constr, validator +from pydantic import BaseModel, Field, constr, validator class DifficultyLevel(Enum): @@ -33,80 +29,6 @@ DIFFICULTY_MAP = { STRING_DIFFICULTY_MAP = {e.value: DIFFICULTY_MAP[e] for e in DifficultyLevel} -def calculate_info_test_path(base_path: Path, benchmark_start_time: datetime) -> Path: - """ - Calculates the path to the directory where the test report will be saved. - """ - # Ensure the reports path exists - base_path.mkdir(parents=True, exist_ok=True) - - # Get current UTC date-time stamp - date_stamp = benchmark_start_time.strftime("%Y%m%dT%H%M%S") - - # Default run name - run_name = "full_run" - - # Map command-line arguments to their respective labels - arg_labels = { - "--test": None, - "--category": None, - "--maintain": "maintain", - "--improve": "improve", - "--explore": "explore", - } - - # Identify the relevant command-line argument - for arg, label in arg_labels.items(): - if arg in sys.argv: - test_arg = sys.argv[sys.argv.index(arg) + 1] if label is None else None - run_name = arg.strip("--") - if test_arg: - run_name = f"{run_name}_{test_arg}" - break - - # Create the full new directory path with ISO standard UTC date-time stamp - report_path = base_path / f"{date_stamp}_{run_name}" - - # Ensure the new directory is created - report_path.mkdir(exist_ok=True) - return report_path - - -class AgentBenchmarkConfig(BaseModel): - """ - This class represents the configuration for the Agent agbenchmark. - It includes the following attributes: - - agent_benchmark_config_path: The path to the agent benchmark config that this object was created from. - - reports_folder: The path to the folder where the benchmark reports will be stored. - - host: The host where the benchmark is run. - """ - - agent_benchmark_config_path: Path | None = None - reports_folder: Path | None = None - host: str | None - - def get_reports_location(self) -> Path: - # if not self.reports_folder: - # self.reports_folder = ( - # Path(self.agent_benchmark_config_path).parent / "reports" - # ).resolve() - return Path.cwd() / "agbenchmark_config" / "reports" - - def get_reports_path(self, benchmark_start_time: datetime) -> Path: - return calculate_info_test_path( - self.get_reports_location(), benchmark_start_time - ) - - def get_regression_reports_path(self) -> Path: - return self.get_reports_location() / "regression_tests.json" - - def get_success_rate_path(self) -> Path: - return self.get_reports_location() / "success_rate.json" - - def get_agent_home_directory(self) -> Path: - return Path(self.agent_benchmark_config_path).resolve().parent - - class Info(BaseModel): difficulty: DifficultyLevel description: constr(regex=r"^Tests if the agent can.*") @@ -180,6 +102,7 @@ class Category(str, Enum): class ChallengeData(BaseModel): + eval_id: str = "" name: str category: List[Category] task: str @@ -189,73 +112,4 @@ class ChallengeData(BaseModel): info: Info | Dict[str, Info] metadata: Optional[Dict[str, Any]] = None - def serialize(self, path: str) -> None: - with open(path, "w") as file: - file.write(self.json()) - - def get_data(self) -> dict: - return self.dict() - - @staticmethod - def get_json_from_path(json_path: Path | str) -> dict: - path = Path(json_path).resolve() - with open(path, "r") as file: - data = json.load(file) - return data - - @staticmethod - def deserialize(path: str) -> "ChallengeData": - # this script is in root/agbenchmark/utils/define_task_types.py - script_dir = Path(__file__).resolve().parent.parent.parent - json_path = script_dir / Path(path) - - with open(json_path, "r") as file: - data = json.load(file) - try: - return ChallengeData(**data) - except: - test = "ok" - - def challenge_from_datum(self, file_datum: list[dict[str, Any]]) -> "ChallengeData": - same_task_data = { - "name": self.prefix, - "dependencies": self.dependencies, - "category": self.shared_category, - "task": self.task, - "cutoff": self.cutoff, - } - - if not self.info: - same_task_data["info"] = { - datum["name"]: datum["info"] for datum in file_datum - } - else: - same_task_data["info"] = self.info - - if not self.ground: - same_task_data["ground"] = { - datum["name"]: datum["ground"] for datum in file_datum - } - else: - same_task_data["ground"] = self.ground - - return ChallengeData(**same_task_data) - - def challenge_from_test_data(self, data: dict[str, Any]) -> "ChallengeData": - same_task_data = { - "name": data["name"], - "dependencies": data["dependencies"], - "category": data["category"], - "info": data["info"], - "ground": data["ground"], - } - - if self.same_task: - same_task_data["category"].extend(self.shared_category) - same_task_data["task"] = self.task - same_task_data["cutoff"] = self.cutoff - else: - same_task_data["task"] = data["task"] - same_task_data["cutoff"] = data["cutoff"] - - return ChallengeData(**same_task_data) + spec_file: Path | None = Field(None, exclude=True) |