| field | value | date |
|---|---|---|
| author | Reinier van der Leer <pwuts@agpt.co> | 2024-02-17 15:48:57 +0100 |
| committer | Reinier van der Leer <pwuts@agpt.co> | 2024-02-17 15:48:57 +0100 |
| commit | 4ede773f5a8639ed41d8b33e3325608f35cebd4f | |
| tree | 396c7f7edb049c693dcc825aed869424fb9a317a /benchmark | |
| parent | ci: Allow telemetry for non-push events, as long as it's on `master` | |
debug(benchmark): Add more debug code to pinpoint cause of rare crash
Target: https://github.com/Significant-Gravitas/AutoGPT/actions/runs/7941977633/job/21684817491
Diffstat (limited to 'benchmark')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | benchmark/agbenchmark/reports/processing/report_types.py | 2 |
| -rw-r--r-- | benchmark/agbenchmark/reports/reports.py | 36 |
2 files changed, 23 insertions, 15 deletions
```diff
diff --git a/benchmark/agbenchmark/reports/processing/report_types.py b/benchmark/agbenchmark/reports/processing/report_types.py
index a3ad8af42..0475455a7 100644
--- a/benchmark/agbenchmark/reports/processing/report_types.py
+++ b/benchmark/agbenchmark/reports/processing/report_types.py
@@ -33,7 +33,7 @@ class TestResult(BaseModel):
             logger.error(
                 "Error validating `success ^ fail_reason` on TestResult: "
                 f"success = {repr(values['success'])}; "
-                f"fail_reason = {repr(v)} ({v})"
+                f"fail_reason = {repr(v)}"
             )
         if v:
             success = values["success"]
diff --git a/benchmark/agbenchmark/reports/reports.py b/benchmark/agbenchmark/reports/reports.py
index 60accd586..538046028 100644
--- a/benchmark/agbenchmark/reports/reports.py
+++ b/benchmark/agbenchmark/reports/reports.py
@@ -4,6 +4,7 @@ import os
 from pathlib import Path
 
 import pytest
+from pydantic import ValidationError
 
 from agbenchmark.challenges import ChallengeInfo
 from agbenchmark.config import AgentBenchmarkConfig
@@ -86,21 +87,28 @@ def add_test_result_to_report(
     else:
         test_report.metrics.attempted = True
 
-    test_report.results.append(
-        TestResult(
-            success=call.excinfo is None,
-            run_time=f"{str(round(call.duration, 3))} seconds",
-            fail_reason=None if call.excinfo is None else str(call.excinfo.value),
-            reached_cutoff=user_properties.get("timed_out", False),
-            n_steps=user_properties.get("n_steps"),
-            cost=user_properties.get("agent_task_cost"),
+    try:
+        test_report.results.append(
+            TestResult(
+                success=call.excinfo is None,
+                run_time=f"{str(round(call.duration, 3))} seconds",
+                fail_reason=None if call.excinfo is None else str(call.excinfo.value),
+                reached_cutoff=user_properties.get("timed_out", False),
+                n_steps=user_properties.get("n_steps"),
+                cost=user_properties.get("agent_task_cost"),
+            )
         )
-    )
-    test_report.metrics.success_percentage = (
-        sum(r.success or False for r in test_report.results)
-        / len(test_report.results)
-        * 100
-    )
+        test_report.metrics.success_percentage = (
+            sum(r.success or False for r in test_report.results)
+            / len(test_report.results)
+            * 100
+        )
+    except ValidationError:
+        logger.error(
+            "Validation failed on TestResult; "
+            f"call.excinfo = {repr(call.excinfo)} ({call.excinfo})"
+        )
+        raise
 
     prev_test_results: list[bool | None] = get_and_update_success_history(
         test_name, test_report.results[-1].success
```
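The `report_types.py` hunk trims a debug log that lives inside the model's `success ^ fail_reason` validator. For readers without the surrounding file, here is a minimal sketch of what such a validator can look like, assuming pydantic v1-style `@validator` semantics and a simplified `TestResult`; the XOR invariant is taken from the log message, everything else is illustrative:

```python
import logging
from typing import Any, Optional

from pydantic import BaseModel, validator

logger = logging.getLogger(__name__)


class TestResult(BaseModel):
    success: Optional[bool] = None
    fail_reason: Optional[str] = None

    @validator("fail_reason")
    def success_xor_fail_reason(cls, v: Optional[str], values: dict[str, Any]):
        # XOR invariant from the log message: a result either succeeded
        # (fail_reason unset) or failed (fail_reason set); never both,
        # never neither.
        if bool(v) == bool(values.get("success")):
            logger.error(
                "Error validating `success ^ fail_reason` on TestResult: "
                f"success = {repr(values.get('success'))}; "
                f"fail_reason = {repr(v)}"
            )
            raise ValueError("exactly one of success / fail_reason must be set")
        return v
```

Raising `ValueError` inside the validator is what surfaces to callers as a `pydantic.ValidationError`, which is the exception the `reports.py` hunk catches.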
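The `reports.py` hunk applies a standard wrap-log-reraise pattern: build the model inside `try`, and on `ValidationError` log the raw `call.excinfo` (the suspected source of the bad field values in the rare CI crash) before re-raising so the run still fails visibly. A sketch of the same pattern, reusing the simplified `TestResult` from the previous sketch; `build_test_result` and its parameters are hypothetical stand-ins for the pytest hook plumbing, not part of the codebase:

```python
import logging
from typing import Optional

from pydantic import ValidationError

logger = logging.getLogger(__name__)


def build_test_result(
    excinfo: Optional[BaseException], fail_reason: Optional[str]
) -> TestResult:
    # Hypothetical helper: construct the model inside try/except so that,
    # when validation fails, the exact inputs are logged before the error
    # propagates.
    try:
        return TestResult(
            success=excinfo is None,
            fail_reason=fail_reason,
        )
    except ValidationError:
        # Re-raise after logging: the run still fails, but a crash that only
        # reproduces in CI now leaves enough context in the job log.
        logger.error(
            "Validation failed on TestResult; "
            f"excinfo = {repr(excinfo)} ({excinfo})"
        )
        raise


# A passing test: no exception info, no fail_reason.
print(build_test_result(excinfo=None, fail_reason=None))
```

Logging and then re-raising, rather than swallowing the error, keeps the benchmark run failing exactly as before while leaving the offending inputs in the job log, which is the point of a pinpointing commit like this one.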