author    Reinier van der Leer <pwuts@agpt.co>  2024-02-16 18:10:46 +0100
committer Reinier van der Leer <pwuts@agpt.co>  2024-02-16 18:10:46 +0100
commit    21f1e64559ad3c292932ff1ea4d69abfe655280a (patch)
tree      554269b322550ddb4a69e3ad32900121d8f5ea86
parent    feat(benchmark/report): Add and record `TestResult.n_steps` (diff)
feat(benchmark): Get agent task cost from `Step.additional_output`
-rw-r--r--  benchmark/agbenchmark/challenges/builtin.py   8
-rw-r--r--  benchmark/agbenchmark/challenges/webarena.py  9
-rw-r--r--  benchmark/agbenchmark/reports/reports.py      1
3 files changed, 18 insertions(+), 0 deletions(-)
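
For orientation before the diff: each agent protocol Step may carry a
dict-like `additional_output`, and this commit reads the task's cost out of
it, preferring the `task_total_cost` key and falling back to
`task_cumulative_cost`. A minimal sketch of that lookup (the helper name is
hypothetical; the diff inlines this logic rather than extracting a function):

    def get_agent_task_cost(additional_output: dict | None) -> float | None:
        # Prefer the explicit total; fall back to the running cumulative
        # cost; None when the step reported neither.
        if not additional_output:
            return None
        return additional_output.get(
            "task_total_cost",
            additional_output.get("task_cumulative_cost"),
        )
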
diff --git a/benchmark/agbenchmark/challenges/builtin.py b/benchmark/agbenchmark/challenges/builtin.py
index 5b616e449..30bd50f8a 100644
--- a/benchmark/agbenchmark/challenges/builtin.py
+++ b/benchmark/agbenchmark/challenges/builtin.py
@@ -175,18 +175,26 @@ class BuiltinChallenge(BaseChallenge):
         task_id = ""
         n_steps = 0
         timed_out = None
+        agent_task_cost = None
         try:
             async for step in self.run_challenge(
                 config, timeout, mock=request.config.getoption("--mock")
             ):
                 if not task_id:
                     task_id = step.task_id
+
                 n_steps += 1
+                if step.additional_output:
+                    agent_task_cost = step.additional_output.get(
+                        "task_total_cost",
+                        step.additional_output.get("task_cumulative_cost"),
+                    )
             timed_out = False
         except TimeoutError:
             timed_out = True
         request.node.user_properties.append(("n_steps", n_steps))
         request.node.user_properties.append(("timed_out", timed_out))
+        request.node.user_properties.append(("agent_task_cost", agent_task_cost))
 
         agent_client_config = ClientConfig(host=config.host)
         async with ApiClient(agent_client_config) as api_client:
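
Note that the loop above overwrites `agent_task_cost` on every step that
carries any `additional_output`, so the recorded value is whatever the last
such step reported; with a running `task_cumulative_cost` that is exactly
the task total. A small illustration with made-up numbers:

    agent_task_cost = None
    for additional_output in [
        {"task_cumulative_cost": 0.012},
        {"task_cumulative_cost": 0.034},
        None,  # a step without additional_output leaves the value untouched
    ]:
        if additional_output:
            agent_task_cost = additional_output.get(
                "task_total_cost",
                additional_output.get("task_cumulative_cost"),
            )
    assert agent_task_cost == 0.034
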
diff --git a/benchmark/agbenchmark/challenges/webarena.py b/benchmark/agbenchmark/challenges/webarena.py
index 3cec1f956..d3a081e04 100644
--- a/benchmark/agbenchmark/challenges/webarena.py
+++ b/benchmark/agbenchmark/challenges/webarena.py
@@ -395,6 +395,7 @@ class WebArenaChallenge(BaseChallenge):
 
         n_steps = 0
         timed_out = None
+        agent_task_cost = None
         eval_results_per_step: list[list[tuple[_Eval, EvalResult]]] = []
         try:
             async for step in self.run_challenge(
@@ -403,7 +404,14 @@ class WebArenaChallenge(BaseChallenge):
                 if not step.output:
                     logger.warn(f"Step has no output: {step}")
                     continue
+
                 n_steps += 1
+                if step.additional_output:
+                    agent_task_cost = step.additional_output.get(
+                        "task_total_cost",
+                        step.additional_output.get("task_cumulative_cost"),
+                    )
+
                 step_eval_results = self.evaluate_step_result(
                     step, mock=request.config.getoption("--mock")
                 )
@@ -423,6 +431,7 @@ class WebArenaChallenge(BaseChallenge):
             timed_out = True
         request.node.user_properties.append(("n_steps", n_steps))
         request.node.user_properties.append(("timed_out", timed_out))
+        request.node.user_properties.append(("agent_task_cost", agent_task_cost))
 
         # Get the column aggregate (highest score for each Eval)
         # from the matrix of EvalResults per step.
diff --git a/benchmark/agbenchmark/reports/reports.py b/benchmark/agbenchmark/reports/reports.py
index 4844f5bfe..3eb0c0011 100644
--- a/benchmark/agbenchmark/reports/reports.py
+++ b/benchmark/agbenchmark/reports/reports.py
@@ -93,6 +93,7 @@ def add_test_result_to_report(
                 fail_reason=str(call.excinfo.value) if call.excinfo else None,
                 reached_cutoff=user_properties.get("timed_out", False),
                 n_steps=user_properties.get("n_steps"),
+                cost=user_properties.get("agent_task_cost"),
            )
        )
    test_report.metrics.success_percentage = (
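
The handoff to the report goes through pytest's `user_properties`, a list of
`(name, value)` tuples appended via `request.node.user_properties` in the
challenge classes; the `.get(...)` calls above suggest reports.py first
converts that list into a dict. A hedged sketch of the consuming side (the
sample values are made up):

    # pytest stores user_properties as (name, value) pairs; converting to a
    # dict gives the .get(...) access used in add_test_result_to_report.
    user_properties = dict([
        ("n_steps", 4),
        ("timed_out", False),
        ("agent_task_cost", 0.034),
    ])
    assert user_properties.get("agent_task_cost") == 0.034
    # If no step ever reported a cost, the property holds None and
    # cost=None is passed through to the TestResult.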