aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Reinier van der Leer <pwuts@agpt.co> 2024-02-19 17:08:24 +0100
committerGravatar Reinier van der Leer <pwuts@agpt.co> 2024-02-19 17:08:24 +0100
commit3a170111293a72118e1f4cb28632af07faaae0ac (patch)
treebb52e157677b2c65e82c7990155f923fd7eeff93
parentchore: Update `agbenchmark` dependency for agent and forge (diff)
downloadAuto-GPT-3a170111293a72118e1f4cb28632af07faaae0ac.tar.gz
Auto-GPT-3a170111293a72118e1f4cb28632af07faaae0ac.tar.bz2
Auto-GPT-3a170111293a72118e1f4cb28632af07faaae0ac.zip
feat(benchmark): Include Steps in Report
-rw-r--r--benchmark/agbenchmark/challenges/builtin.py10
-rw-r--r--benchmark/agbenchmark/challenges/webarena.py3
-rw-r--r--benchmark/agbenchmark/reports/processing/report_types.py3
-rw-r--r--benchmark/agbenchmark/reports/reports.py1
4 files changed, 16 insertions, 1 deletion
diff --git a/benchmark/agbenchmark/challenges/builtin.py b/benchmark/agbenchmark/challenges/builtin.py
index 694d10184..71e61bad4 100644
--- a/benchmark/agbenchmark/challenges/builtin.py
+++ b/benchmark/agbenchmark/challenges/builtin.py
@@ -10,7 +10,12 @@ from pathlib import Path
from typing import Any, ClassVar, Iterator, Literal, Optional
import pytest
-from agent_protocol_client import AgentApi, ApiClient, Configuration as ClientConfig
+from agent_protocol_client import (
+ AgentApi,
+ ApiClient,
+ Configuration as ClientConfig,
+ Step,
+)
from colorama import Fore, Style
from openai import _load_client as get_openai_client
from pydantic import BaseModel, constr, Field, validator
@@ -176,6 +181,7 @@ class BuiltinChallenge(BaseChallenge):
n_steps = 0
timed_out = None
agent_task_cost = None
+ steps: list[Step] = []
try:
async for step in self.run_challenge(
config, timeout, mock=request.config.getoption("--mock")
@@ -184,6 +190,7 @@ class BuiltinChallenge(BaseChallenge):
task_id = step.task_id
n_steps += 1
+ steps.append(step.copy())
if step.additional_output:
agent_task_cost = step.additional_output.get(
"task_total_cost",
@@ -192,6 +199,7 @@ class BuiltinChallenge(BaseChallenge):
timed_out = False
except TimeoutError:
timed_out = True
+ request.node.user_properties.append(("steps", steps))
request.node.user_properties.append(("n_steps", n_steps))
request.node.user_properties.append(("timed_out", timed_out))
request.node.user_properties.append(("agent_task_cost", agent_task_cost))
diff --git a/benchmark/agbenchmark/challenges/webarena.py b/benchmark/agbenchmark/challenges/webarena.py
index 2e51ab2be..9f44ac8f4 100644
--- a/benchmark/agbenchmark/challenges/webarena.py
+++ b/benchmark/agbenchmark/challenges/webarena.py
@@ -396,6 +396,7 @@ class WebArenaChallenge(BaseChallenge):
n_steps = 0
timed_out = None
agent_task_cost = None
+ steps: list[Step] = []
eval_results_per_step: list[list[tuple[_Eval, EvalResult]]] = []
try:
async for step in self.run_challenge(
@@ -406,6 +407,7 @@ class WebArenaChallenge(BaseChallenge):
continue
n_steps += 1
+ steps.append(step)
if step.additional_output:
agent_task_cost = step.additional_output.get(
"task_total_cost",
@@ -429,6 +431,7 @@ class WebArenaChallenge(BaseChallenge):
timed_out = False
except TimeoutError:
timed_out = True
+ request.node.user_properties.append(("steps", steps))
request.node.user_properties.append(("n_steps", n_steps))
request.node.user_properties.append(("timed_out", timed_out))
request.node.user_properties.append(("agent_task_cost", agent_task_cost))
diff --git a/benchmark/agbenchmark/reports/processing/report_types.py b/benchmark/agbenchmark/reports/processing/report_types.py
index 0475455a7..ea2ad840f 100644
--- a/benchmark/agbenchmark/reports/processing/report_types.py
+++ b/benchmark/agbenchmark/reports/processing/report_types.py
@@ -5,6 +5,7 @@ Model definitions used internally and for reports generated during command-line
import logging
from typing import Any, Dict, List
+from agent_protocol_client import Step
from pydantic import BaseModel, Field, constr, validator
datetime_format = r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\+00:00$"
@@ -24,6 +25,8 @@ class TestResult(BaseModel):
"""Whether the run had to be stopped due to reaching the timeout"""
n_steps: int | None = None
"""The number of steps executed by the agent"""
+ steps: list[Step] = []
+ """The steps generated by the agent"""
cost: float | None = None
"""The (known) cost incurred by the run, e.g. from using paid LLM APIs"""
diff --git a/benchmark/agbenchmark/reports/reports.py b/benchmark/agbenchmark/reports/reports.py
index 2068e86f2..431f4ba6e 100644
--- a/benchmark/agbenchmark/reports/reports.py
+++ b/benchmark/agbenchmark/reports/reports.py
@@ -97,6 +97,7 @@ def add_test_result_to_report(
),
reached_cutoff=user_properties.get("timed_out", False),
n_steps=user_properties.get("n_steps"),
+ steps=user_properties.get("steps", []),
cost=user_properties.get("agent_task_cost"),
)
)