From 21f1e64559ad3c292932ff1ea4d69abfe655280a Mon Sep 17 00:00:00 2001
From: Reinier van der Leer <pwuts@agpt.co>
Date: Fri, 16 Feb 2024 18:10:46 +0100
Subject: feat(benchmark): Get agent task cost from `Step.additional_output`

---
 benchmark/agbenchmark/challenges/builtin.py  | 8 ++++++++
 benchmark/agbenchmark/challenges/webarena.py | 9 +++++++++
 benchmark/agbenchmark/reports/reports.py     | 1 +
 3 files changed, 18 insertions(+)

diff --git a/benchmark/agbenchmark/challenges/builtin.py b/benchmark/agbenchmark/challenges/builtin.py
index 5b616e449..30bd50f8a 100644
--- a/benchmark/agbenchmark/challenges/builtin.py
+++ b/benchmark/agbenchmark/challenges/builtin.py
@@ -175,18 +175,26 @@ class BuiltinChallenge(BaseChallenge):
         task_id = ""
         n_steps = 0
         timed_out = None
+        agent_task_cost = None
         try:
             async for step in self.run_challenge(
                 config, timeout, mock=request.config.getoption("--mock")
             ):
                 if not task_id:
                     task_id = step.task_id
+
                 n_steps += 1
+                if step.additional_output:
+                    agent_task_cost = step.additional_output.get(
+                        "task_total_cost",
+                        step.additional_output.get("task_cumulative_cost"),
+                    )
             timed_out = False
         except TimeoutError:
             timed_out = True
         request.node.user_properties.append(("n_steps", n_steps))
         request.node.user_properties.append(("timed_out", timed_out))
+        request.node.user_properties.append(("agent_task_cost", agent_task_cost))
 
         agent_client_config = ClientConfig(host=config.host)
         async with ApiClient(agent_client_config) as api_client:
diff --git a/benchmark/agbenchmark/challenges/webarena.py b/benchmark/agbenchmark/challenges/webarena.py
index 3cec1f956..d3a081e04 100644
--- a/benchmark/agbenchmark/challenges/webarena.py
+++ b/benchmark/agbenchmark/challenges/webarena.py
@@ -395,6 +395,7 @@ class WebArenaChallenge(BaseChallenge):
 
         n_steps = 0
         timed_out = None
+        agent_task_cost = None
         eval_results_per_step: list[list[tuple[_Eval, EvalResult]]] = []
         try:
             async for step in self.run_challenge(
@@ -403,7 +404,14 @@ class WebArenaChallenge(BaseChallenge):
                 if not step.output:
                     logger.warn(f"Step has no output: {step}")
                     continue
+
                 n_steps += 1
+                if step.additional_output:
+                    agent_task_cost = step.additional_output.get(
+                        "task_total_cost",
+                        step.additional_output.get("task_cumulative_cost"),
+                    )
+
                 step_eval_results = self.evaluate_step_result(
                     step, mock=request.config.getoption("--mock")
                 )
@@ -423,6 +431,7 @@ class WebArenaChallenge(BaseChallenge):
             timed_out = True
         request.node.user_properties.append(("n_steps", n_steps))
         request.node.user_properties.append(("timed_out", timed_out))
+        request.node.user_properties.append(("agent_task_cost", agent_task_cost))
 
         # Get the column aggregate (highest score for each Eval)
         # from the matrix of EvalResults per step.
diff --git a/benchmark/agbenchmark/reports/reports.py b/benchmark/agbenchmark/reports/reports.py
index 4844f5bfe..3eb0c0011 100644
--- a/benchmark/agbenchmark/reports/reports.py
+++ b/benchmark/agbenchmark/reports/reports.py
@@ -93,6 +93,7 @@ def add_test_result_to_report(
             fail_reason=str(call.excinfo.value) if call.excinfo else None,
             reached_cutoff=user_properties.get("timed_out", False),
             n_steps=user_properties.get("n_steps"),
+            cost=user_properties.get("agent_task_cost"),
         )
     )
     test_report.metrics.success_percentage = (
-- 
cgit v1.2.3