author    Reinier van der Leer <pwuts@agpt.co>  2024-02-14 01:05:34 +0100
committer Reinier van der Leer <pwuts@agpt.co>  2024-02-14 01:05:34 +0100
commit    327fb1f9166d389c434adbc44720241f46244fa8 (patch)
tree      7f02973051245fe291f937189c82f398b63cc240 /benchmark
parent    fix(agent/text_processing): Fix `extract_information` LLM response parsing (diff)
fix(benchmark): Mock mode, python evals, `--attempts` flag, challenge definitions
- Fixed `--mock` mode
  - Moved interrupt to beginning of the step iterator pipeline (from `BuiltinChallenge` to `agent_api_interface.py:run_api_agent`). This ensures that any finish-up code is properly executed after executing a single step (sketched below).
  - Implemented mock mode in `WebArenaChallenge`
- Fixed `fixture 'i_attempt' not found` error when `--attempts`/`-N` is omitted
- Fixed handling of `python`/`pytest` evals in `BuiltinChallenge`
- Disabled left-over Helicone code (see 056163e)
- Fixed a couple of challenge definitions
  - WebArena task 107: fix spelling of months (Sepetember, Octorbor *lmao*)
  - synthesize/1_basic_content_gen (SynthesizeInfo): remove empty string from `should_contain` list
- Added some debug logging in agent_api_interface.py and challenges/builtin.py
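A minimal, hypothetical sketch of the relocated mock-mode interrupt (simplified: `FakeStep` and `fake_execute_step` are stand-ins for the agent-protocol `Step` model and API call; the real change is in the `run_api_agent` hunk below):

```python
import asyncio
from dataclasses import dataclass
from typing import AsyncIterator


@dataclass
class FakeStep:
    name: str
    is_last: bool = False


async def fake_execute_step(i: int) -> FakeStep:
    # Stand-in for api_instance.execute_agent_task_step(task_id=...)
    await asyncio.sleep(0)
    return FakeStep(name=f"step_{i}")


async def run_steps(*, mock: bool = False) -> AsyncIterator[FakeStep]:
    i = 0
    while True:
        i += 1
        step = await fake_execute_step(i)
        yield step
        if mock:
            # Interrupt inside the iterator: mark the step as last so the loop
            # exits normally instead of the caller breaking out early.
            step.is_last = True
        if step.is_last:
            break
    # Finish-up code goes here (e.g. uploading mock output artifacts and
    # downloading agent artifacts) and is now reached even in mock mode.


async def main() -> None:
    async for step in run_steps(mock=True):
        print(f"executed {step.name}")


if __name__ == "__main__":
    asyncio.run(main())
```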
Diffstat (limited to 'benchmark')
-rw-r--r--  benchmark/agbenchmark/agent_api_interface.py  10
-rw-r--r--  benchmark/agbenchmark/challenges/base.py  4
-rw-r--r--  benchmark/agbenchmark/challenges/builtin.py  63
-rw-r--r--  benchmark/agbenchmark/challenges/verticals/synthesize/1_basic_content_gen/data.json  4
-rw-r--r--  benchmark/agbenchmark/challenges/webarena.py  22
-rw-r--r--  benchmark/agbenchmark/challenges/webarena_selection.json  4
6 files changed, 63 insertions, 44 deletions
diff --git a/benchmark/agbenchmark/agent_api_interface.py b/benchmark/agbenchmark/agent_api_interface.py
index 6eadcc537..e500f494f 100644
--- a/benchmark/agbenchmark/agent_api_interface.py
+++ b/benchmark/agbenchmark/agent_api_interface.py
@@ -23,6 +23,8 @@ async def run_api_agent(
config: AgentBenchmarkConfig,
timeout: int,
artifacts_location: Optional[Path] = None,
+ *,
+ mock: bool = False,
) -> AsyncIterator[Step]:
configuration = Configuration(host=config.host)
async with ApiClient(configuration) as api_client:
@@ -36,26 +38,32 @@ async def run_api_agent(
task_id = response.task_id
if artifacts_location:
+ logger.debug("Uploading task input artifacts to agent...")
await upload_artifacts(
api_instance, artifacts_location, task_id, "artifacts_in"
)
+ logger.debug("Running agent until finished or timeout...")
while True:
step = await api_instance.execute_agent_task_step(task_id=task_id)
yield step
if time.time() - start_time > timeout:
raise TimeoutError("Time limit exceeded")
+ if step and mock:
+ step.is_last = True
if not step or step.is_last:
break
if artifacts_location:
# In "mock" mode, we cheat by giving the correct artifacts to pass the test
- if os.getenv("IS_MOCK"):
+ if mock:
+ logger.debug("Uploading mock artifacts to agent...")
await upload_artifacts(
api_instance, artifacts_location, task_id, "artifacts_out"
)
+ logger.debug("Downloading agent artifacts...")
await download_agent_artifacts_into_folder(
api_instance, task_id, config.temp_folder
)
diff --git a/benchmark/agbenchmark/challenges/base.py b/benchmark/agbenchmark/challenges/base.py
index 64ead3a9d..4fe73a2d7 100644
--- a/benchmark/agbenchmark/challenges/base.py
+++ b/benchmark/agbenchmark/challenges/base.py
@@ -60,7 +60,7 @@ class BaseChallenge(ABC):
@classmethod
async def run_challenge(
- cls, config: AgentBenchmarkConfig, timeout: int
+ cls, config: AgentBenchmarkConfig, timeout: int, *, mock: bool = False
) -> AsyncIterator[Step]:
"""
Runs the challenge on the subject agent with the specified timeout.
@@ -89,7 +89,7 @@ class BaseChallenge(ABC):
logger.debug(f"Starting {cls.info.name} challenge run")
i = 0
async for step in run_api_agent(
- cls.info.task, config, timeout, cls.info.task_artifacts_dir
+ cls.info.task, config, timeout, cls.info.task_artifacts_dir, mock=mock
):
i += 1
print(f"[{cls.info.name}] - step {step.name} ({i}. request)")
diff --git a/benchmark/agbenchmark/challenges/builtin.py b/benchmark/agbenchmark/challenges/builtin.py
index 590696688..fd28dc3ee 100644
--- a/benchmark/agbenchmark/challenges/builtin.py
+++ b/benchmark/agbenchmark/challenges/builtin.py
@@ -158,12 +158,12 @@ class BuiltinChallenge(BaseChallenge):
self,
config: AgentBenchmarkConfig,
request: pytest.FixtureRequest,
- i_attempt: int,
+ i_attempt: int = 0,
) -> None:
- if os.environ.get("HELICONE_API_KEY"):
- from helicone.lock import HeliconeLockManager
+ # if os.environ.get("HELICONE_API_KEY"):
+ # from helicone.lock import HeliconeLockManager
- HeliconeLockManager.write_custom_property("challenge", self.info.name)
+ # HeliconeLockManager.write_custom_property("challenge", self.info.name)
timeout = self._spec.cutoff or 60
@@ -175,12 +175,11 @@ class BuiltinChallenge(BaseChallenge):
task_id = ""
timed_out = None
try:
- async for step in self.run_challenge(config, timeout):
+ async for step in self.run_challenge(
+ config, timeout, mock=request.config.getoption("--mock")
+ ):
if not task_id:
task_id = step.task_id
- if request.config.getoption("--mock"):
- # Run only one step in mock mode
- break
timed_out = False
except TimeoutError:
timed_out = True
@@ -230,15 +229,6 @@ class BuiltinChallenge(BaseChallenge):
@classmethod
def evaluate_workspace_content(cls, workspace: Path) -> Iterator[EvalResult]:
- if cls._spec.task == "" and os.getenv("IS_MOCK"):
- yield EvalResult(
- result="This is a mock answer",
- result_source="step_output",
- score=1.0,
- passed=True,
- )
- return
-
result_ground = cls._spec.ground
outputs_for_eval = cls.get_outputs_for_eval(workspace, result_ground)
@@ -254,6 +244,15 @@ class BuiltinChallenge(BaseChallenge):
passed=score > 0.9, # FIXME: arbitrary threshold
)
+ if result_ground.eval.type in ("python", "pytest"):
+ for py_file, output in outputs_for_eval:
+ yield EvalResult(
+ result=output,
+ result_source=str(py_file),
+ score=float(not output.startswith("Error:")),
+ passed=not output.startswith("Error:"),
+ )
+
if result_ground.eval.type == "llm":
combined_results = "\n".join(output[1] for output in outputs_for_eval)
llm_eval = cls.score_result_with_llm(combined_results, result_ground)
@@ -290,7 +289,16 @@ class BuiltinChallenge(BaseChallenge):
# Otherwise, it is a specific file
matching_files = [os.path.join(script_dir, file_pattern)]
+ logger.debug(
+ f"Files to evaluate for pattern `{file_pattern}`: {matching_files}"
+ )
+
for file_path in matching_files:
+ relative_file_path = Path(file_path).relative_to(workspace)
+ logger.debug(
+ f"Evaluating {relative_file_path} "
+ f"(eval type: {ground.eval.type})..."
+ )
if ground.eval.type == "python":
result = subprocess.run(
[sys.executable, file_path],
@@ -299,15 +307,12 @@ class BuiltinChallenge(BaseChallenge):
text=True,
)
if "error" in result.stderr or result.returncode != 0:
- print(result.stderr)
- assert False, result.stderr
- yield (
- Path(file_path).relative_to(workspace),
- f"Output: {result.stdout}\n",
- )
+ yield relative_file_path, f"Error: {result.stderr}\n"
+ else:
+ yield relative_file_path, f"Output: {result.stdout}\n"
else:
with open(file_path, "r") as f:
- yield Path(file_path).relative_to(workspace), f.read()
+ yield relative_file_path, f.read()
else:
if ground.eval.type == "pytest":
result = subprocess.run(
@@ -317,9 +322,9 @@ class BuiltinChallenge(BaseChallenge):
text=True,
)
if "error" in result.stderr or result.returncode != 0:
- print(result.stderr)
- assert False, result.stderr
- yield "pytest", f"Output: {result.stdout}\n"
+ yield "pytest", f"Error: {result.stderr}\n"
+ else:
+ yield "pytest", f"Output: {result.stdout}\n"
@staticmethod
def score_result(content: str, ground: BuiltinChallengeSpec.Ground) -> float | None:
@@ -358,9 +363,9 @@ class BuiltinChallenge(BaseChallenge):
@classmethod
def score_result_with_llm(
- cls, content: str, ground: BuiltinChallengeSpec.Ground
+ cls, content: str, ground: BuiltinChallengeSpec.Ground, *, mock: bool = False
) -> float:
- if os.getenv("IS_MOCK"):
+ if mock:
return 1.0
# the validation for this is done in the Eval BaseModel
diff --git a/benchmark/agbenchmark/challenges/verticals/synthesize/1_basic_content_gen/data.json b/benchmark/agbenchmark/challenges/verticals/synthesize/1_basic_content_gen/data.json
index 68ae89288..6993c4adc 100644
--- a/benchmark/agbenchmark/challenges/verticals/synthesize/1_basic_content_gen/data.json
+++ b/benchmark/agbenchmark/challenges/verticals/synthesize/1_basic_content_gen/data.json
@@ -18,9 +18,7 @@
"files": [
"output.txt"
],
- "should_contain": [
- ""
- ],
+ "should_contain": [],
"should_not_contain": []
},
"info": {
diff --git a/benchmark/agbenchmark/challenges/webarena.py b/benchmark/agbenchmark/challenges/webarena.py
index 24f569327..a11330c1d 100644
--- a/benchmark/agbenchmark/challenges/webarena.py
+++ b/benchmark/agbenchmark/challenges/webarena.py
@@ -317,7 +317,11 @@ class WebArenaChallenge(BaseChallenge):
return results
@classmethod
- def evaluate_step_result(cls, step: Step) -> list[tuple[_Eval, EvalResult]]:
+ def evaluate_step_result(
+ cls, step: Step, *, mock: bool = False
+ ) -> list[tuple[_Eval, EvalResult]]:
+ if mock:
+ step.output = cls.info.reference_answer
assert step.output
eval_results = cls.evaluate_answer(step.output)
for eval in cls._spec.eval.evaluators:
@@ -356,12 +360,12 @@ class WebArenaChallenge(BaseChallenge):
self,
config: AgentBenchmarkConfig,
request: pytest.FixtureRequest,
- i_attempt: int,
+ i_attempt: int = 0,
) -> None:
- if os.environ.get("HELICONE_API_KEY"):
- from helicone.lock import HeliconeLockManager
+ # if os.environ.get("HELICONE_API_KEY"):
+ # from helicone.lock import HeliconeLockManager
- HeliconeLockManager.write_custom_property("challenge", self.info.name)
+ # HeliconeLockManager.write_custom_property("challenge", self.info.name)
timeout = 120
if request.config.getoption("--nc"):
@@ -372,11 +376,15 @@ class WebArenaChallenge(BaseChallenge):
timed_out = None
eval_results_per_step: list[list[tuple[_Eval, EvalResult]]] = []
try:
- async for step in self.run_challenge(config, timeout):
+ async for step in self.run_challenge(
+ config, timeout, mock=request.config.getoption("--mock")
+ ):
if not step.output:
logger.warn(f"Step has no output: {step}")
continue
- step_eval_results = self.evaluate_step_result(step)
+ step_eval_results = self.evaluate_step_result(
+ step, mock=request.config.getoption("--mock")
+ )
logger.debug(f"Intermediary results: {step_eval_results}")
eval_results_per_step.append(step_eval_results)
if step.is_last:
diff --git a/benchmark/agbenchmark/challenges/webarena_selection.json b/benchmark/agbenchmark/challenges/webarena_selection.json
index e35a27d37..af06c7133 100644
--- a/benchmark/agbenchmark/challenges/webarena_selection.json
+++ b/benchmark/agbenchmark/challenges/webarena_selection.json
@@ -334,7 +334,7 @@
"June: 13 orders",
"July: 9 orders",
"August: 8 orders",
- "Sepetember: 10 orders",
+ "September: 10 orders",
"October: 4 orders",
"November: 5 orders",
"December: 10 orders"
@@ -343,7 +343,7 @@
"reference_url": "",
"program_html": [],
"string_note": "",
- "reference_answer_raw_annotation": "May: 8 orders June: 13 orders July: 9 orders August: 8 orders Sepetember: 10 orders Octorbor: 4 orders November: 5 orders December: 10 orders "
+ "reference_answer_raw_annotation": "May: 8 orders; June: 13 orders; July: 9 orders; August: 8 orders; September: 10 orders; October: 4 orders; November: 5 orders; December: 10 orders"
},
"intent_template_id": 270,
"string_note": null,