author    Reinier van der Leer <pwuts@agpt.co>  2024-02-14 01:05:34 +0100
committer Reinier van der Leer <pwuts@agpt.co>  2024-02-14 01:05:34 +0100
commit    327fb1f9166d389c434adbc44720241f46244fa8 (patch)
tree      7f02973051245fe291f937189c82f398b63cc240 /benchmark
parent    fix(agent/text_processing): Fix `extract_information` LLM response parsing (diff)
fix(benchmark): Mock mode, python evals, `--attempts` flag, challenge definitions
- Fixed `--mock` mode
  - Moved interrupt to beginning of the step iterator pipeline (from `BuiltinChallenge` to `agent_api_interface.py:run_api_agent`). This ensures that any finish-up code is properly executed after executing a single step (sketched below).
  - Implemented mock mode in `WebArenaChallenge`
- Fixed `fixture 'i_attempt' not found` error when `--attempts`/`-N` is omitted
- Fixed handling of `python`/`pytest` evals in `BuiltinChallenge`
- Disabled left-over Helicone code (see 056163e)
- Fixed a couple of challenge definitions
  - WebArena task 107: fix spelling of months (Sepetember, Octorbor *lmao*)
  - synthesize/1_basic_content_gen (SynthesizeInfo): remove empty string from `should_contain` list
- Added some debug logging in agent_api_interface.py and challenges/builtin.py
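A minimal, hypothetical sketch of the relocated mock-mode interrupt (simplified: `FakeStep` and `fake_execute_step` are stand-ins for the agent-protocol `Step` model and API call; the real change is in the `run_api_agent` hunk below):

```python
import asyncio
from dataclasses import dataclass
from typing import AsyncIterator


@dataclass
class FakeStep:
    name: str
    is_last: bool = False


async def fake_execute_step(i: int) -> FakeStep:
    # Stand-in for api_instance.execute_agent_task_step(task_id=...)
    await asyncio.sleep(0)
    return FakeStep(name=f"step_{i}")


async def run_steps(*, mock: bool = False) -> AsyncIterator[FakeStep]:
    i = 0
    while True:
        i += 1
        step = await fake_execute_step(i)
        yield step
        if mock:
            # Interrupt inside the iterator: mark the step as last so the loop
            # exits normally instead of the caller breaking out early.
            step.is_last = True
        if step.is_last:
            break
    # Finish-up code goes here (e.g. uploading mock output artifacts and
    # downloading agent artifacts) and is now reached even in mock mode.


async def main() -> None:
    async for step in run_steps(mock=True):
        print(f"executed {step.name}")


if __name__ == "__main__":
    asyncio.run(main())
```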
Diffstat (limited to 'benchmark')
-rw-r--r--  benchmark/agbenchmark/agent_api_interface.py  10
-rw-r--r--  benchmark/agbenchmark/challenges/base.py  4
-rw-r--r--  benchmark/agbenchmark/challenges/builtin.py  63
-rw-r--r--  benchmark/agbenchmark/challenges/verticals/synthesize/1_basic_content_gen/data.json  4
-rw-r--r--  benchmark/agbenchmark/challenges/webarena.py  22
-rw-r--r--  benchmark/agbenchmark/challenges/webarena_selection.json  4
6 files changed, 63 insertions, 44 deletions
diff --git a/benchmark/agbenchmark/agent_api_interface.py b/benchmark/agbenchmark/agent_api_interface.py
index 6eadcc537..e500f494f 100644
--- a/benchmark/agbenchmark/agent_api_interface.py
+++ b/benchmark/agbenchmark/agent_api_interface.py
@@ -23,6 +23,8 @@ async def run_api_agent(
config: AgentBenchmarkConfig,
timeout: int,
artifacts_location: Optional[Path] = None,
+ *,
+ mock: bool = False,
) -> AsyncIterator[Step]:
configuration = Configuration(host=config.host)
async with ApiClient(configuration) as api_client:
@@ -36,26 +38,32 @@ async def run_api_agent(
task_id = response.task_id
if artifacts_location:
+ logger.debug("Uploading task input artifacts to agent...")
await upload_artifacts(
api_instance, artifacts_location, task_id, "artifacts_in"
)
+ logger.debug("Running agent until finished or timeout...")
while True:
step = await api_instance.execute_agent_task_step(task_id=task_id)
yield step
if time.time() - start_time > timeout:
raise TimeoutError("Time limit exceeded")
+ if step and mock:
+ step.is_last = True
if not step or step.is_last:
break
if artifacts_location:
# In "mock" mode, we cheat by giving the correct artifacts to pass the test
- if os.getenv("IS_MOCK"):
+ if mock:
+ logger.debug("Uploading mock artifacts to agent...")
await upload_artifacts(
api_instance, artifacts_location, task_id, "artifacts_out"
)
+ logger.debug("Downloading agent artifacts...")
await download_agent_artifacts_into_folder(
api_instance, task_id, config.temp_folder
)
diff --git a/benchmark/agbenchmark/challenges/base.py b/benchmark/agbenchmark/challenges/base.py
index 64ead3a9d..4fe73a2d7 100644
--- a/benchmark/agbenchmark/challenges/base.py
+++ b/benchmark/agbenchmark/challenges/base.py
@@ -60,7 +60,7 @@ class BaseChallenge(ABC):
@classmethod
async def run_challenge(
- cls, config: AgentBenchmarkConfig, timeout: int
+ cls, config: AgentBenchmarkConfig, timeout: int, *, mock: bool = False
) -> AsyncIterator[Step]:
"""
Runs the challenge on the subject agent with the specified timeout.
@@ -89,7 +89,7 @@ class BaseChallenge(ABC):
logger.debug(f"Starting {cls.info.name} challenge run")
i = 0
async for step in run_api_agent(
- cls.info.task, config, timeout, cls.info.task_artifacts_dir
+ cls.info.task, config, timeout, cls.info.task_artifacts_dir, mock=mock
):
i += 1
print(f"[{cls.info.name}] - step {step.name} ({i}. request)")
diff --git a/benchmark/agbenchmark/challenges/builtin.py b/benchmark/agbenchmark/challenges/builtin.py
index 590696688..fd28dc3ee 100644
--- a/benchmark/agbenchmark/challenges/builtin.py
+++ b/benchmark/agbenchmark/challenges/builtin.py
@@ -158,12 +158,12 @@ class BuiltinChallenge(BaseChallenge):
self,
config: AgentBenchmarkConfig,
request: pytest.FixtureRequest,
- i_attempt: int,
+ i_attempt: int = 0,
) -> None:
- if os.environ.get("HELICONE_API_KEY"):
- from helicone.lock import HeliconeLockManager
+ # if os.environ.get("HELICONE_API_KEY"):
+ # from helicone.lock import HeliconeLockManager
- HeliconeLockManager.write_custom_property("challenge", self.info.name)
+ # HeliconeLockManager.write_custom_property("challenge", self.info.name)
timeout = self._spec.cutoff or 60
@@ -175,12 +175,11 @@ class BuiltinChallenge(BaseChallenge):
task_id = ""
timed_out = None
try:
- async for step in self.run_challenge(config, timeout):
+ async for step in self.run_challenge(
+ config, timeout, mock=request.config.getoption("--mock")
+ ):
if not task_id:
task_id = step.task_id
- if request.config.getoption("--mock"):
- # Run only one step in mock mode
- break
timed_out = False
except TimeoutError:
timed_out = True
@@ -230,15 +229,6 @@ class BuiltinChallenge(BaseChallenge):
@classmethod
def evaluate_workspace_content(cls, workspace: Path) -> Iterator[EvalResult]:
- if cls._spec.task == "" and os.getenv("IS_MOCK"):
- yield EvalResult(
- result="This is a mock answer",
- result_source="step_output",
- score=1.0,
- passed=True,
- )
- return
-
result_ground = cls._spec.ground
outputs_for_eval = cls.get_outputs_for_eval(workspace, result_ground)
@@ -254,6 +244,15 @@ class BuiltinChallenge(BaseChallenge):
passed=score > 0.9, # FIXME: arbitrary threshold
)
+ if result_ground.eval.type in ("python", "pytest"):
+ for py_file, output in outputs_for_eval:
+ yield EvalResult(
+ result=output,
+ result_source=str(py_file),
+ score=float(not output.startswith("Error:")),
+ passed=not output.startswith("Error:"),
+ )
+
if result_ground.eval.type == "llm":
combined_results = "\n".join(output[1] for output in outputs_for_eval)
llm_eval = cls.score_result_with_llm(combined_results, result_ground)
@@ -290,7 +289,16 @@ class BuiltinChallenge(BaseChallenge):
# Otherwise, it is a specific file
matching_files = [os.path.join(script_dir, file_pattern)]
+ logger.debug(
+ f"Files to evaluate for pattern `{file_pattern}`: {matching_files}"
+ )
+
for file_path in matching_files:
+ relative_file_path = Path(file_path).relative_to(workspace)
+ logger.debug(
+ f"Evaluating {relative_file_path} "
+ f"(eval type: {ground.eval.type})..."
+ )
if ground.eval.type == "python":
result = subprocess.run(
[sys.executable, file_path],
@@ -299,15 +307,12 @@ class BuiltinChallenge(BaseChallenge):
text=True,
)
if "error" in result.stderr or result.returncode != 0:
- print(result.stderr)
- assert False, result.stderr
- yield (
- Path(file_path).relative_to(workspace),
- f"Output: {result.stdout}\n",
- )
+ yield relative_file_path, f"Error: {result.stderr}\n"
+ else:
+ yield relative_file_path, f"Output: {result.stdout}\n"
else:
with open(file_path, "r") as f:
- yield Path(file_path).relative_to(workspace), f.read()
+ yield relative_file_path, f.read()
else:
if ground.eval.type == "pytest":
result = subprocess.run(
@@ -317,9 +322,9 @@ class BuiltinChallenge(BaseChallenge):
text=True,
)
if "error" in result.stderr or result.returncode != 0:
- print(result.stderr)
- assert False, result.stderr
- yield "pytest", f"Output: {result.stdout}\n"
+ yield "pytest", f"Error: {result.stderr}\n"
+ else:
+ yield "pytest", f"Output: {result.stdout}\n"
@staticmethod
def score_result(content: str, ground: BuiltinChallengeSpec.Ground) -> float | None:
@@ -358,9 +363,9 @@ class BuiltinChallenge(BaseChallenge):
@classmethod
def score_result_with_llm(
- cls, content: str, ground: BuiltinChallengeSpec.Ground
+ cls, content: str, ground: BuiltinChallengeSpec.Ground, *, mock: bool = False
) -> float:
- if os.getenv("IS_MOCK"):
+ if mock:
return 1.0
# the validation for this is done in the Eval BaseModel
diff --git a/benchmark/agbenchmark/challenges/verticals/synthesize/1_basic_content_gen/data.json b/benchmark/agbenchmark/challenges/verticals/synthesize/1_basic_content_gen/data.json
index 68ae89288..6993c4adc 100644
--- a/benchmark/agbenchmark/challenges/verticals/synthesize/1_basic_content_gen/data.json
+++ b/benchmark/agbenchmark/challenges/verticals/synthesize/1_basic_content_gen/data.json
@@ -18,9 +18,7 @@
"files": [
"output.txt"
],
- "should_contain": [
- ""
- ],
+ "should_contain": [],
"should_not_contain": []
},
"info": {
diff --git a/benchmark/agbenchmark/challenges/webarena.py b/benchmark/agbenchmark/challenges/webarena.py
index 24f569327..a11330c1d 100644
--- a/benchmark/agbenchmark/challenges/webarena.py
+++ b/benchmark/agbenchmark/challenges/webarena.py
@@ -317,7 +317,11 @@ class WebArenaChallenge(BaseChallenge):
return results
@classmethod
- def evaluate_step_result(cls, step: Step) -> list[tuple[_Eval, EvalResult]]:
+ def evaluate_step_result(
+ cls, step: Step, *, mock: bool = False
+ ) -> list[tuple[_Eval, EvalResult]]:
+ if mock:
+ step.output = cls.info.reference_answer
assert step.output
eval_results = cls.evaluate_answer(step.output)
for eval in cls._spec.eval.evaluators:
@@ -356,12 +360,12 @@ class WebArenaChallenge(BaseChallenge):
self,
config: AgentBenchmarkConfig,
request: pytest.FixtureRequest,
- i_attempt: int,
+ i_attempt: int = 0,
) -> None:
- if os.environ.get("HELICONE_API_KEY"):
- from helicone.lock import HeliconeLockManager
+ # if os.environ.get("HELICONE_API_KEY"):
+ # from helicone.lock import HeliconeLockManager
- HeliconeLockManager.write_custom_property("challenge", self.info.name)
+ # HeliconeLockManager.write_custom_property("challenge", self.info.name)
timeout = 120
if request.config.getoption("--nc"):
@@ -372,11 +376,15 @@ class WebArenaChallenge(BaseChallenge):
timed_out = None
eval_results_per_step: list[list[tuple[_Eval, EvalResult]]] = []
try:
- async for step in self.run_challenge(config, timeout):
+ async for step in self.run_challenge(
+ config, timeout, mock=request.config.getoption("--mock")
+ ):
if not step.output:
logger.warn(f"Step has no output: {step}")
continue
- step_eval_results = self.evaluate_step_result(step)
+ step_eval_results = self.evaluate_step_result(
+ step, mock=request.config.getoption("--mock")
+ )
logger.debug(f"Intermediary results: {step_eval_results}")
eval_results_per_step.append(step_eval_results)
if step.is_last:
diff --git a/benchmark/agbenchmark/challenges/webarena_selection.json b/benchmark/agbenchmark/challenges/webarena_selection.json
index e35a27d37..af06c7133 100644
--- a/benchmark/agbenchmark/challenges/webarena_selection.json
+++ b/benchmark/agbenchmark/challenges/webarena_selection.json
@@ -334,7 +334,7 @@
"June: 13 orders",
"July: 9 orders",
"August: 8 orders",
- "Sepetember: 10 orders",
+ "September: 10 orders",
"October: 4 orders",
"November: 5 orders",
"December: 10 orders"
@@ -343,7 +343,7 @@
"reference_url": "",
"program_html": [],
"string_note": "",
- "reference_answer_raw_annotation": "May: 8 orders June: 13 orders July: 9 orders August: 8 orders Sepetember: 10 orders Octorbor: 4 orders November: 5 orders December: 10 orders "
+ "reference_answer_raw_annotation": "May: 8 orders; June: 13 orders; July: 9 orders; August: 8 orders; September: 10 orders; October: 4 orders; November: 5 orders; December: 10 orders"
},
"intent_template_id": 270,
"string_note": null,