From 21f1e64559ad3c292932ff1ea4d69abfe655280a Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Fri, 16 Feb 2024 18:10:46 +0100 Subject: feat(benchmark): Get agent task cost from `Step.additional_output` --- benchmark/agbenchmark/challenges/builtin.py | 8 ++++++++ benchmark/agbenchmark/challenges/webarena.py | 9 +++++++++ benchmark/agbenchmark/reports/reports.py | 1 + 3 files changed, 18 insertions(+) diff --git a/benchmark/agbenchmark/challenges/builtin.py b/benchmark/agbenchmark/challenges/builtin.py index 5b616e449..30bd50f8a 100644 --- a/benchmark/agbenchmark/challenges/builtin.py +++ b/benchmark/agbenchmark/challenges/builtin.py @@ -175,18 +175,26 @@ class BuiltinChallenge(BaseChallenge): task_id = "" n_steps = 0 timed_out = None + agent_task_cost = None try: async for step in self.run_challenge( config, timeout, mock=request.config.getoption("--mock") ): if not task_id: task_id = step.task_id + n_steps += 1 + if step.additional_output: + agent_task_cost = step.additional_output.get( + "task_total_cost", + step.additional_output.get("task_cumulative_cost"), + ) timed_out = False except TimeoutError: timed_out = True request.node.user_properties.append(("n_steps", n_steps)) request.node.user_properties.append(("timed_out", timed_out)) + request.node.user_properties.append(("agent_task_cost", agent_task_cost)) agent_client_config = ClientConfig(host=config.host) async with ApiClient(agent_client_config) as api_client: diff --git a/benchmark/agbenchmark/challenges/webarena.py b/benchmark/agbenchmark/challenges/webarena.py index 3cec1f956..d3a081e04 100644 --- a/benchmark/agbenchmark/challenges/webarena.py +++ b/benchmark/agbenchmark/challenges/webarena.py @@ -395,6 +395,7 @@ class WebArenaChallenge(BaseChallenge): n_steps = 0 timed_out = None + agent_task_cost = None eval_results_per_step: list[list[tuple[_Eval, EvalResult]]] = [] try: async for step in self.run_challenge( @@ -403,7 +404,14 @@ class WebArenaChallenge(BaseChallenge): if not step.output: logger.warn(f"Step has no output: {step}") continue + n_steps += 1 + if step.additional_output: + agent_task_cost = step.additional_output.get( + "task_total_cost", + step.additional_output.get("task_cumulative_cost"), + ) + step_eval_results = self.evaluate_step_result( step, mock=request.config.getoption("--mock") ) @@ -423,6 +431,7 @@ class WebArenaChallenge(BaseChallenge): timed_out = True request.node.user_properties.append(("n_steps", n_steps)) request.node.user_properties.append(("timed_out", timed_out)) + request.node.user_properties.append(("agent_task_cost", agent_task_cost)) # Get the column aggregate (highest score for each Eval) # from the matrix of EvalResults per step. diff --git a/benchmark/agbenchmark/reports/reports.py b/benchmark/agbenchmark/reports/reports.py index 4844f5bfe..3eb0c0011 100644 --- a/benchmark/agbenchmark/reports/reports.py +++ b/benchmark/agbenchmark/reports/reports.py @@ -93,6 +93,7 @@ def add_test_result_to_report( fail_reason=str(call.excinfo.value) if call.excinfo else None, reached_cutoff=user_properties.get("timed_out", False), n_steps=user_properties.get("n_steps"), + cost=user_properties.get("agent_task_cost"), ) ) test_report.metrics.success_percentage = ( -- cgit v1.2.3