author      Reinier van der Leer <pwuts@agpt.co>    2024-02-14 01:05:34 +0100
committer   Reinier van der Leer <pwuts@agpt.co>    2024-02-14 01:05:34 +0100
commit      327fb1f9166d389c434adbc44720241f46244fa8
tree        7f02973051245fe291f937189c82f398b63cc240
parent      fix(agent/text_processing): Fix `extract_information` LLM response parsing
fix(benchmark): Mock mode, python evals, `--attempts` flag, challenge definitions
- Fixed `--mock` mode
- Moved interrupt to beginning of the step iterator pipeline (from `BuiltinChallenge` to `agent_api_interface.py:run_api_agent`). This ensures that any finish-up code is properly executed after executing a single step. (A minimal sketch of this pattern follows the change list.)
- Implemented mock mode in `WebArenaChallenge`
- Fixed `fixture 'i_attempt' not found` error when `--attempts`/`-N` is omitted (see the second sketch after the change list)
- Fixed handling of `python`/`pytest` evals in `BuiltinChallenge`
- Disabled left-over Helicone code (see 056163e)
- Fixed a couple of challenge definitions
- WebArena task 107: fix spelling of months (Sepetember, Octorbor *lmao*)
- synthesize/1_basic_content_gen (SynthesizeInfo): remove empty string from `should_contain` list
- Added some debug logging in agent_api_interface.py and challenges/builtin.py
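The interrupt relocation described above follows a simple pattern: instead of the caller `break`-ing out of its `async for` loop after one step (which skips everything after the loop), the iterator itself marks the step as final, so the loop exits through its normal path and the finish-up code after it still runs. The sketch below is a minimal, self-contained illustration of that pattern; `Step`, `run_steps`, and `main` are stand-ins, not the benchmark's actual code.

import asyncio
from dataclasses import dataclass


@dataclass
class Step:
    name: str
    is_last: bool = False


async def run_steps(*, mock: bool = False):
    """Yield steps; in mock mode, flag the first step as the last one."""
    n = 0
    while True:
        n += 1
        step = Step(name=f"step {n}", is_last=n >= 3)
        yield step
        if mock:
            # Interrupt here, inside the iterator, instead of `break`-ing in the caller
            step.is_last = True
        if step.is_last:
            break
    # Finish-up work lives after the loop and is still reached in mock mode
    print("downloading artifacts...")


async def main() -> None:
    async for step in run_steps(mock=True):
        print(f"executed {step.name}")


asyncio.run(main())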
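On the `i_attempt` fix: pytest treats every test-function parameter that has no default value as a fixture request, so when `--attempts`/`-N` is omitted and nothing parametrizes `i_attempt`, collection fails with `fixture 'i_attempt' not found`. Giving the parameter a default, as this commit does, takes it out of pytest's fixture lookup. The hypothetical test file below demonstrates the behavior; it is not part of the benchmark.

def test_without_default(i_attempt: int) -> None:
    # Collected as requiring an `i_attempt` fixture; errors out if no fixture
    # or parametrization provides one.
    assert i_attempt >= 0


def test_with_default(i_attempt: int = 0) -> None:
    # A parameter with a default is not resolved as a fixture; the test simply
    # runs with i_attempt == 0 unless something parametrizes it.
    assert i_attempt == 0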
Diffstat (limited to 'benchmark')
6 files changed, 63 insertions, 44 deletions
diff --git a/benchmark/agbenchmark/agent_api_interface.py b/benchmark/agbenchmark/agent_api_interface.py
index 6eadcc537..e500f494f 100644
--- a/benchmark/agbenchmark/agent_api_interface.py
+++ b/benchmark/agbenchmark/agent_api_interface.py
@@ -23,6 +23,8 @@ async def run_api_agent(
     config: AgentBenchmarkConfig,
     timeout: int,
     artifacts_location: Optional[Path] = None,
+    *,
+    mock: bool = False,
 ) -> AsyncIterator[Step]:
     configuration = Configuration(host=config.host)
     async with ApiClient(configuration) as api_client:
@@ -36,26 +38,32 @@ async def run_api_agent(
         task_id = response.task_id
 
         if artifacts_location:
+            logger.debug("Uploading task input artifacts to agent...")
             await upload_artifacts(
                 api_instance, artifacts_location, task_id, "artifacts_in"
             )
 
+        logger.debug("Running agent until finished or timeout...")
         while True:
             step = await api_instance.execute_agent_task_step(task_id=task_id)
             yield step
 
             if time.time() - start_time > timeout:
                 raise TimeoutError("Time limit exceeded")
+            if step and mock:
+                step.is_last = True
             if not step or step.is_last:
                 break
 
         if artifacts_location:
             # In "mock" mode, we cheat by giving the correct artifacts to pass the test
-            if os.getenv("IS_MOCK"):
+            if mock:
+                logger.debug("Uploading mock artifacts to agent...")
                 await upload_artifacts(
                     api_instance, artifacts_location, task_id, "artifacts_out"
                 )
 
+        logger.debug("Downloading agent artifacts...")
         await download_agent_artifacts_into_folder(
             api_instance, task_id, config.temp_folder
         )
diff --git a/benchmark/agbenchmark/challenges/base.py b/benchmark/agbenchmark/challenges/base.py
index 64ead3a9d..4fe73a2d7 100644
--- a/benchmark/agbenchmark/challenges/base.py
+++ b/benchmark/agbenchmark/challenges/base.py
@@ -60,7 +60,7 @@ class BaseChallenge(ABC):
 
     @classmethod
     async def run_challenge(
-        cls, config: AgentBenchmarkConfig, timeout: int
+        cls, config: AgentBenchmarkConfig, timeout: int, *, mock: bool = False
     ) -> AsyncIterator[Step]:
         """
         Runs the challenge on the subject agent with the specified timeout.
@@ -89,7 +89,7 @@ class BaseChallenge(ABC):
         logger.debug(f"Starting {cls.info.name} challenge run")
         i = 0
         async for step in run_api_agent(
-            cls.info.task, config, timeout, cls.info.task_artifacts_dir
+            cls.info.task, config, timeout, cls.info.task_artifacts_dir, mock=mock
        ):
             i += 1
             print(f"[{cls.info.name}] - step {step.name} ({i}. request)")
diff --git a/benchmark/agbenchmark/challenges/builtin.py b/benchmark/agbenchmark/challenges/builtin.py
index 590696688..fd28dc3ee 100644
--- a/benchmark/agbenchmark/challenges/builtin.py
+++ b/benchmark/agbenchmark/challenges/builtin.py
@@ -158,12 +158,12 @@ class BuiltinChallenge(BaseChallenge):
         self,
         config: AgentBenchmarkConfig,
         request: pytest.FixtureRequest,
-        i_attempt: int,
+        i_attempt: int = 0,
     ) -> None:
-        if os.environ.get("HELICONE_API_KEY"):
-            from helicone.lock import HeliconeLockManager
+        # if os.environ.get("HELICONE_API_KEY"):
+        #     from helicone.lock import HeliconeLockManager
 
-            HeliconeLockManager.write_custom_property("challenge", self.info.name)
+        #     HeliconeLockManager.write_custom_property("challenge", self.info.name)
 
         timeout = self._spec.cutoff or 60
 
@@ -175,12 +175,11 @@ class BuiltinChallenge(BaseChallenge):
         task_id = ""
         timed_out = None
         try:
-            async for step in self.run_challenge(config, timeout):
+            async for step in self.run_challenge(
+                config, timeout, mock=request.config.getoption("--mock")
+            ):
                 if not task_id:
                     task_id = step.task_id
-                if request.config.getoption("--mock"):
-                    # Run only one step in mock mode
-                    break
                 timed_out = False
         except TimeoutError:
             timed_out = True
@@ -230,15 +229,6 @@ class BuiltinChallenge(BaseChallenge):
 
     @classmethod
     def evaluate_workspace_content(cls, workspace: Path) -> Iterator[EvalResult]:
-        if cls._spec.task == "" and os.getenv("IS_MOCK"):
-            yield EvalResult(
-                result="This is a mock answer",
-                result_source="step_output",
-                score=1.0,
-                passed=True,
-            )
-            return
-
         result_ground = cls._spec.ground
         outputs_for_eval = cls.get_outputs_for_eval(workspace, result_ground)
 
@@ -254,6 +244,15 @@ class BuiltinChallenge(BaseChallenge):
                 passed=score > 0.9,  # FIXME: arbitrary threshold
             )
 
+        if result_ground.eval.type in ("python", "pytest"):
+            for py_file, output in outputs_for_eval:
+                yield EvalResult(
+                    result=output,
+                    result_source=str(py_file),
+                    score=float(not output.startswith("Error:")),
+                    passed=not output.startswith("Error:"),
+                )
+
         if result_ground.eval.type == "llm":
             combined_results = "\n".join(output[1] for output in outputs_for_eval)
             llm_eval = cls.score_result_with_llm(combined_results, result_ground)
@@ -290,7 +289,16 @@ class BuiltinChallenge(BaseChallenge):
                     # Otherwise, it is a specific file
                     matching_files = [os.path.join(script_dir, file_pattern)]
 
+            logger.debug(
+                f"Files to evaluate for pattern `{file_pattern}`: {matching_files}"
+            )
+
             for file_path in matching_files:
+                relative_file_path = Path(file_path).relative_to(workspace)
+                logger.debug(
+                    f"Evaluating {relative_file_path} "
+                    f"(eval type: {ground.eval.type})..."
+                )
                 if ground.eval.type == "python":
                     result = subprocess.run(
                         [sys.executable, file_path],
@@ -299,15 +307,12 @@ class BuiltinChallenge(BaseChallenge):
                         text=True,
                     )
                     if "error" in result.stderr or result.returncode != 0:
-                        print(result.stderr)
-                        assert False, result.stderr
-                    yield (
-                        Path(file_path).relative_to(workspace),
-                        f"Output: {result.stdout}\n",
-                    )
+                        yield relative_file_path, f"Error: {result.stderr}\n"
+                    else:
+                        yield relative_file_path, f"Output: {result.stdout}\n"
                 else:
                     with open(file_path, "r") as f:
-                        yield Path(file_path).relative_to(workspace), f.read()
+                        yield relative_file_path, f.read()
         else:
             if ground.eval.type == "pytest":
                 result = subprocess.run(
@@ -317,9 +322,9 @@ class BuiltinChallenge(BaseChallenge):
                     text=True,
                 )
                 if "error" in result.stderr or result.returncode != 0:
-                    print(result.stderr)
-                    assert False, result.stderr
-                yield "pytest", f"Output: {result.stdout}\n"
+                    yield "pytest", f"Error: {result.stderr}\n"
+                else:
+                    yield "pytest", f"Output: {result.stdout}\n"
 
     @staticmethod
     def score_result(content: str, ground: BuiltinChallengeSpec.Ground) -> float | None:
@@ -358,9 +363,9 @@ class BuiltinChallenge(BaseChallenge):
 
     @classmethod
     def score_result_with_llm(
-        cls, content: str, ground: BuiltinChallengeSpec.Ground
+        cls, content: str, ground: BuiltinChallengeSpec.Ground, *, mock: bool = False
     ) -> float:
-        if os.getenv("IS_MOCK"):
+        if mock:
             return 1.0
 
         # the validation for this is done in the Eval BaseModel
diff --git a/benchmark/agbenchmark/challenges/verticals/synthesize/1_basic_content_gen/data.json b/benchmark/agbenchmark/challenges/verticals/synthesize/1_basic_content_gen/data.json
index 68ae89288..6993c4adc 100644
--- a/benchmark/agbenchmark/challenges/verticals/synthesize/1_basic_content_gen/data.json
+++ b/benchmark/agbenchmark/challenges/verticals/synthesize/1_basic_content_gen/data.json
@@ -18,9 +18,7 @@
     "files": [
       "output.txt"
     ],
-    "should_contain": [
-      ""
-    ],
+    "should_contain": [],
     "should_not_contain": []
   },
   "info": {
diff --git a/benchmark/agbenchmark/challenges/webarena.py b/benchmark/agbenchmark/challenges/webarena.py
index 24f569327..a11330c1d 100644
--- a/benchmark/agbenchmark/challenges/webarena.py
+++ b/benchmark/agbenchmark/challenges/webarena.py
@@ -317,7 +317,11 @@ class WebArenaChallenge(BaseChallenge):
         return results
 
     @classmethod
-    def evaluate_step_result(cls, step: Step) -> list[tuple[_Eval, EvalResult]]:
+    def evaluate_step_result(
+        cls, step: Step, *, mock: bool = False
+    ) -> list[tuple[_Eval, EvalResult]]:
+        if mock:
+            step.output = cls.info.reference_answer
         assert step.output
         eval_results = cls.evaluate_answer(step.output)
         for eval in cls._spec.eval.evaluators:
@@ -356,12 +360,12 @@ class WebArenaChallenge(BaseChallenge):
         self,
         config: AgentBenchmarkConfig,
         request: pytest.FixtureRequest,
-        i_attempt: int,
+        i_attempt: int = 0,
     ) -> None:
-        if os.environ.get("HELICONE_API_KEY"):
-            from helicone.lock import HeliconeLockManager
+        # if os.environ.get("HELICONE_API_KEY"):
+        #     from helicone.lock import HeliconeLockManager
 
-            HeliconeLockManager.write_custom_property("challenge", self.info.name)
+        #     HeliconeLockManager.write_custom_property("challenge", self.info.name)
 
         timeout = 120
         if request.config.getoption("--nc"):
@@ -372,11 +376,15 @@ class WebArenaChallenge(BaseChallenge):
         timed_out = None
         eval_results_per_step: list[list[tuple[_Eval, EvalResult]]] = []
         try:
-            async for step in self.run_challenge(config, timeout):
+            async for step in self.run_challenge(
+                config, timeout, mock=request.config.getoption("--mock")
+            ):
                 if not step.output:
                     logger.warn(f"Step has no output: {step}")
                     continue
-                step_eval_results = self.evaluate_step_result(step)
+                step_eval_results = self.evaluate_step_result(
+                    step, mock=request.config.getoption("--mock")
+                )
                 logger.debug(f"Intermediary results: {step_eval_results}")
                 eval_results_per_step.append(step_eval_results)
                 if step.is_last:
diff --git a/benchmark/agbenchmark/challenges/webarena_selection.json b/benchmark/agbenchmark/challenges/webarena_selection.json
index e35a27d37..af06c7133 100644
--- a/benchmark/agbenchmark/challenges/webarena_selection.json
+++ b/benchmark/agbenchmark/challenges/webarena_selection.json
@@ -334,7 +334,7 @@
                     "June: 13 orders",
                     "July: 9 orders",
                     "August: 8 orders",
-                    "Sepetember: 10 orders",
+                    "September: 10 orders",
                     "October: 4 orders",
                     "November: 5 orders",
                     "December: 10 orders"
@@ -343,7 +343,7 @@
             "reference_url": "",
             "program_html": [],
             "string_note": "",
-            "reference_answer_raw_annotation": "May: 8 orders June: 13 orders July: 9 orders August: 8 orders Sepetember: 10 orders Octorbor: 4 orders November: 5 orders December: 10 orders "
+            "reference_answer_raw_annotation": "May: 8 orders; June: 13 orders; July: 9 orders; August: 8 orders; September: 10 orders; October: 4 orders; November: 5 orders; December: 10 orders"
         },
         "intent_template_id": 270,
         "string_note": null,