author    Reinier van der Leer <pwuts@agpt.co>  2024-02-17 15:48:57 +0100
committer Reinier van der Leer <pwuts@agpt.co>  2024-02-17 15:48:57 +0100
commit    4ede773f5a8639ed41d8b33e3325608f35cebd4f (patch)
tree      396c7f7edb049c693dcc825aed869424fb9a317a /benchmark
parent    ci: Allow telemetry for non-push events, as long as it's on `master` (diff)
debug(benchmark): Add more debug code to pinpoint cause of rare crash
Target: https://github.com/Significant-Gravitas/AutoGPT/actions/runs/7941977633/job/21684817491
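The debug pattern added here is small: construct the pydantic model inside a try block, log the raw inputs when validation fails, and re-raise so the test run still fails loudly but the CI log carries enough context to debug the crash. A minimal, self-contained sketch of that pattern (illustrative names only, not the project's code):

    import logging

    from pydantic import BaseModel, ValidationError

    logger = logging.getLogger(__name__)


    class Result(BaseModel):
        # Stand-in for agbenchmark's TestResult; fields are illustrative.
        success: bool
        fail_reason: str | None = None


    def record_result(raw: dict) -> Result:
        try:
            return Result(**raw)
        except ValidationError:
            # Log the offending input before re-raising, so the next CI run
            # that hits the rare crash also captures what triggered it.
            logger.error("Validation failed on Result; raw input = %r", raw)
            raise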
Diffstat (limited to 'benchmark')
-rw-r--r--  benchmark/agbenchmark/reports/processing/report_types.py |  2
-rw-r--r--  benchmark/agbenchmark/reports/reports.py                 | 36
2 files changed, 23 insertions(+), 15 deletions(-)
diff --git a/benchmark/agbenchmark/reports/processing/report_types.py b/benchmark/agbenchmark/reports/processing/report_types.py
index a3ad8af42..0475455a7 100644
--- a/benchmark/agbenchmark/reports/processing/report_types.py
+++ b/benchmark/agbenchmark/reports/processing/report_types.py
@@ -33,7 +33,7 @@ class TestResult(BaseModel):
logger.error(
"Error validating `success ^ fail_reason` on TestResult: "
f"success = {repr(values['success'])}; "
- f"fail_reason = {repr(v)} ({v})"
+ f"fail_reason = {repr(v)}"
)
if v:
success = values["success"]
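For context: the log line trimmed above sits in a cross-field check on TestResult that expects `success` and `fail_reason` to be mutually exclusive. A minimal sketch of that kind of pydantic v1-style validator (an illustrative class, not the repository's actual model):

    import logging
    from typing import Optional

    from pydantic import BaseModel, validator

    logger = logging.getLogger(__name__)


    class TestResultSketch(BaseModel):
        success: bool
        fail_reason: Optional[str] = None

        @validator("fail_reason")
        def fail_reason_xor_success(cls, v, values):
            # A reason should be given exactly when the test did not succeed.
            if bool(v) == values.get("success", False):
                logger.error(
                    "Error validating `success ^ fail_reason`: "
                    f"success = {values.get('success')!r}; fail_reason = {v!r}"
                )
                raise ValueError("fail_reason must be set iff success is False")
            return v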
diff --git a/benchmark/agbenchmark/reports/reports.py b/benchmark/agbenchmark/reports/reports.py
index 60accd586..538046028 100644
--- a/benchmark/agbenchmark/reports/reports.py
+++ b/benchmark/agbenchmark/reports/reports.py
@@ -4,6 +4,7 @@ import os
from pathlib import Path

import pytest
+from pydantic import ValidationError

from agbenchmark.challenges import ChallengeInfo
from agbenchmark.config import AgentBenchmarkConfig
@@ -86,21 +87,28 @@ def add_test_result_to_report(
else:
test_report.metrics.attempted = True
- test_report.results.append(
- TestResult(
- success=call.excinfo is None,
- run_time=f"{str(round(call.duration, 3))} seconds",
- fail_reason=None if call.excinfo is None else str(call.excinfo.value),
- reached_cutoff=user_properties.get("timed_out", False),
- n_steps=user_properties.get("n_steps"),
- cost=user_properties.get("agent_task_cost"),
+ try:
+ test_report.results.append(
+ TestResult(
+ success=call.excinfo is None,
+ run_time=f"{str(round(call.duration, 3))} seconds",
+ fail_reason=None if call.excinfo is None else str(call.excinfo.value),
+ reached_cutoff=user_properties.get("timed_out", False),
+ n_steps=user_properties.get("n_steps"),
+ cost=user_properties.get("agent_task_cost"),
+ )
)
- )
- test_report.metrics.success_percentage = (
- sum(r.success or False for r in test_report.results)
- / len(test_report.results)
- * 100
- )
+ test_report.metrics.success_percentage = (
+ sum(r.success or False for r in test_report.results)
+ / len(test_report.results)
+ * 100
+ )
+ except ValidationError:
+ logger.error(
+ "Validation failed on TestResult; "
+ f"call.excinfo = {repr(call.excinfo)} ({call.excinfo})"
+ )
+ raise
prev_test_results: list[bool | None] = get_and_update_success_history(
test_name, test_report.results[-1].success
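One note on the block that moved inside the try: the `success_percentage` recomputation coerces a missing success value to a failure via `r.success or False` before averaging. A quick stand-in illustration of that arithmetic (plain values instead of TestResult objects):

    results = [True, None, False, True]  # stand-ins for r.success values
    success_percentage = sum(r or False for r in results) / len(results) * 100
    print(success_percentage)  # 50.0 -- None counts as a failure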