aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Reinier van der Leer <pwuts@agpt.co> 2024-02-19 17:08:24 +0100
committerGravatar Reinier van der Leer <pwuts@agpt.co> 2024-02-19 17:08:24 +0100
commit3a170111293a72118e1f4cb28632af07faaae0ac (patch)
treebb52e157677b2c65e82c7990155f923fd7eeff93
parentchore: Update `agbenchmark` dependency for agent and forge (diff)
downloadAuto-GPT-3a170111293a72118e1f4cb28632af07faaae0ac.tar.gz
Auto-GPT-3a170111293a72118e1f4cb28632af07faaae0ac.tar.bz2
Auto-GPT-3a170111293a72118e1f4cb28632af07faaae0ac.zip
feat(benchmark): Include Steps in Report
-rw-r--r--benchmark/agbenchmark/challenges/builtin.py10
-rw-r--r--benchmark/agbenchmark/challenges/webarena.py3
-rw-r--r--benchmark/agbenchmark/reports/processing/report_types.py3
-rw-r--r--benchmark/agbenchmark/reports/reports.py1
4 files changed, 16 insertions, 1 deletion
diff --git a/benchmark/agbenchmark/challenges/builtin.py b/benchmark/agbenchmark/challenges/builtin.py
index 694d10184..71e61bad4 100644
--- a/benchmark/agbenchmark/challenges/builtin.py
+++ b/benchmark/agbenchmark/challenges/builtin.py
@@ -10,7 +10,12 @@ from pathlib import Path
from typing import Any, ClassVar, Iterator, Literal, Optional
import pytest
-from agent_protocol_client import AgentApi, ApiClient, Configuration as ClientConfig
+from agent_protocol_client import (
+ AgentApi,
+ ApiClient,
+ Configuration as ClientConfig,
+ Step,
+)
from colorama import Fore, Style
from openai import _load_client as get_openai_client
from pydantic import BaseModel, constr, Field, validator
@@ -176,6 +181,7 @@ class BuiltinChallenge(BaseChallenge):
n_steps = 0
timed_out = None
agent_task_cost = None
+ steps: list[Step] = []
try:
async for step in self.run_challenge(
config, timeout, mock=request.config.getoption("--mock")
@@ -184,6 +190,7 @@ class BuiltinChallenge(BaseChallenge):
task_id = step.task_id
n_steps += 1
+ steps.append(step.copy())
if step.additional_output:
agent_task_cost = step.additional_output.get(
"task_total_cost",
@@ -192,6 +199,7 @@ class BuiltinChallenge(BaseChallenge):
timed_out = False
except TimeoutError:
timed_out = True
+ request.node.user_properties.append(("steps", steps))
request.node.user_properties.append(("n_steps", n_steps))
request.node.user_properties.append(("timed_out", timed_out))
request.node.user_properties.append(("agent_task_cost", agent_task_cost))
diff --git a/benchmark/agbenchmark/challenges/webarena.py b/benchmark/agbenchmark/challenges/webarena.py
index 2e51ab2be..9f44ac8f4 100644
--- a/benchmark/agbenchmark/challenges/webarena.py
+++ b/benchmark/agbenchmark/challenges/webarena.py
@@ -396,6 +396,7 @@ class WebArenaChallenge(BaseChallenge):
n_steps = 0
timed_out = None
agent_task_cost = None
+ steps: list[Step] = []
eval_results_per_step: list[list[tuple[_Eval, EvalResult]]] = []
try:
async for step in self.run_challenge(
@@ -406,6 +407,7 @@ class WebArenaChallenge(BaseChallenge):
continue
n_steps += 1
+ steps.append(step)
if step.additional_output:
agent_task_cost = step.additional_output.get(
"task_total_cost",
@@ -429,6 +431,7 @@ class WebArenaChallenge(BaseChallenge):
timed_out = False
except TimeoutError:
timed_out = True
+ request.node.user_properties.append(("steps", steps))
request.node.user_properties.append(("n_steps", n_steps))
request.node.user_properties.append(("timed_out", timed_out))
request.node.user_properties.append(("agent_task_cost", agent_task_cost))
diff --git a/benchmark/agbenchmark/reports/processing/report_types.py b/benchmark/agbenchmark/reports/processing/report_types.py
index 0475455a7..ea2ad840f 100644
--- a/benchmark/agbenchmark/reports/processing/report_types.py
+++ b/benchmark/agbenchmark/reports/processing/report_types.py
@@ -5,6 +5,7 @@ Model definitions used internally and for reports generated during command-line
import logging
from typing import Any, Dict, List
+from agent_protocol_client import Step
from pydantic import BaseModel, Field, constr, validator
datetime_format = r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\+00:00$"
@@ -24,6 +25,8 @@ class TestResult(BaseModel):
"""Whether the run had to be stopped due to reaching the timeout"""
n_steps: int | None = None
"""The number of steps executed by the agent"""
+ steps: list[Step] = []
+ """The steps generated by the agent"""
cost: float | None = None
"""The (known) cost incurred by the run, e.g. from using paid LLM APIs"""
diff --git a/benchmark/agbenchmark/reports/reports.py b/benchmark/agbenchmark/reports/reports.py
index 2068e86f2..431f4ba6e 100644
--- a/benchmark/agbenchmark/reports/reports.py
+++ b/benchmark/agbenchmark/reports/reports.py
@@ -97,6 +97,7 @@ def add_test_result_to_report(
),
reached_cutoff=user_properties.get("timed_out", False),
n_steps=user_properties.get("n_steps"),
+ steps=user_properties.get("steps", []),
cost=user_properties.get("agent_task_cost"),
)
)