Diffstat (limited to 'benchmark/agbenchmark/reports/processing/report_types.py')
-rw-r--r-- | benchmark/agbenchmark/reports/processing/report_types.py | 87 |
1 file changed, 87 insertions, 0 deletions
diff --git a/benchmark/agbenchmark/reports/processing/report_types.py b/benchmark/agbenchmark/reports/processing/report_types.py
new file mode 100644
index 000000000..b6deef021
--- /dev/null
+++ b/benchmark/agbenchmark/reports/processing/report_types.py
@@ -0,0 +1,87 @@
+"""
+Model definitions used internally and for reports generated during command-line runs.
+"""
+
+from typing import Any, Dict, List
+
+from pydantic import BaseModel, Field, constr, validator
+
+datetime_format = r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\+00:00$"
+
+
+class TestResult(BaseModel):
+    """Result details for a single run of a test/challenge."""
+
+    success: bool | None = None
+    """Whether the run was successful"""
+    run_time: str | None = None
+    """The (formatted) duration of the run"""
+    fail_reason: str | None = None
+    """If applicable, the reason why the run was not successful"""
+    reached_cutoff: bool | None = None  # None if in progress
+    """Whether the run had to be stopped due to reaching the timeout"""
+    cost: float | None = None
+    """The (known) cost incurred by the run, e.g. from using paid LLM APIs"""
+
+    @validator("fail_reason")
+    def success_xor_fail_reason(cls, v: str | None, values: dict[str, Any]):
+        if v:
+            success = values["success"]
+            assert not success, "fail_reason must only be specified if success=False"
+        else:
+            assert values["success"], "fail_reason is required if success=False"
+        return v
+
+
+class TestMetrics(BaseModel):
+    """
+    Result metrics for a set of runs for a test/challenge. Should be an aggregate of all
+    results for the same test/challenge within a benchmarking session.
+    """
+
+    attempted: bool
+    """Whether the challenge was attempted during this session"""
+    is_regression: bool
+    """Whether the challenge was considered a regression test at the time of running"""
+    success_percentage: float | None = Field(default=None, alias="success_%")
+    """Success rate (0-100) for this challenge within the session"""
+
+
+class MetricsOverall(BaseModel):
+    """Global metrics concerning a benchmarking session"""
+
+    run_time: str
+    """Duration from beginning to end of the session"""
+    highest_difficulty: str
+    """
+    Difficulty of the most difficult challenge that succeeded at least once this session
+    """
+    total_cost: float | None = None
+    """Total known cost of the session"""
+
+
+class Test(BaseModel):
+    category: List[str]
+    difficulty: str | None
+    data_path: str
+    description: str
+    task: str
+    answer: str
+    metrics: TestMetrics
+    results: list[TestResult]
+    metadata: dict[str, Any] | None = Field(default_factory=dict)
+
+
+class ReportBase(BaseModel):
+    command: str
+    completion_time: str | None = None
+    benchmark_start_time: constr(regex=datetime_format)
+    metrics: MetricsOverall
+    config: Dict[str, str | dict[str, str]]
+    agent_git_commit_sha: str | None = None
+    benchmark_git_commit_sha: str | None = None
+    repo_url: str | None = None
+
+
+class Report(ReportBase):
+    tests: Dict[str, Test]
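
For context, a minimal usage sketch of these models (all values below are hypothetical; it assumes pydantic v1, consistent with the `validator` and `constr(regex=...)` API the module uses). Note that `success_percentage` must be populated through its alias `success_%`, since none of the models enable population by field name:

    from agbenchmark.reports.processing.report_types import (
        MetricsOverall,
        Report,
        Test,
        TestMetrics,
        TestResult,
    )

    report = Report(
        command="agbenchmark run",  # hypothetical CLI invocation
        # Must match datetime_format: UTC with a literal "+00:00" offset
        benchmark_start_time="2024-01-01T12:00:00+00:00",
        metrics=MetricsOverall(run_time="1 min 30 s", highest_difficulty="basic"),
        config={},
        tests={
            "TestWriteFile": Test(  # hypothetical challenge name
                category=["general"],
                difficulty="basic",
                data_path="agbenchmark/challenges/write_file",  # placeholder path
                description="Tests the agent's ability to write a file",
                task="Write 'Hello World' to hello.txt",
                answer="hello.txt contains 'Hello World'",
                # success_percentage is aliased to "success_%", so populate it
                # via the alias:
                metrics=TestMetrics(
                    attempted=True, is_regression=False, **{"success_%": 100.0}
                ),
                # fail_reason stays unset: the validator asserts it is only
                # given when success=False
                results=[
                    TestResult(success=True, run_time="12.3 s", reached_cutoff=False)
                ],
            )
        },
    )

    # Serialize with aliases so the output key reads "success_%" (pydantic v1 API)
    print(report.json(indent=2, by_alias=True))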