aboutsummaryrefslogtreecommitdiff
path: root/benchmark
diff options
context:
space:
mode:
authorGravatar Reinier van der Leer <pwuts@agpt.co> 2024-01-19 19:52:09 +0100
committerGravatar Reinier van der Leer <pwuts@agpt.co> 2024-01-19 19:52:09 +0100
commit: 05b018a837a67233c57e336942afd9ad374ac2d8 (patch)
tree: 00d15a95445a35d11958a8a096aebd31ef3e844b /benchmark
parent: feat(agent/llm/openai): Include compatibility tool call extraction in LLM res... (diff)
download: Auto-GPT-05b018a837a67233c57e336942afd9ad374ac2d8.tar.gz
Auto-GPT-05b018a837a67233c57e336942afd9ad374ac2d8.tar.bz2
Auto-GPT-05b018a837a67233c57e336942afd9ad374ac2d8.zip
fix(benchmark/report): Fix and clean up logic in `update_challenges_already_beaten`
- `update_challenges_already_beaten` incorrectly marked a challenge as beaten if it was present in the report file but set to `false`
Diffstat (limited to 'benchmark')
-rw-r--r-- benchmark/agbenchmark/reports/reports.py | 13
1 files changed, 5 insertions, 8 deletions
diff --git a/benchmark/agbenchmark/reports/reports.py b/benchmark/agbenchmark/reports/reports.py
index 1e6186f2f..7b03233e4 100644
--- a/benchmark/agbenchmark/reports/reports.py
+++ b/benchmark/agbenchmark/reports/reports.py
@@ -96,7 +96,6 @@ def finalize_test_report(
mock = os.getenv("IS_MOCK") # Check if --mock is in sys.argv
- logger.debug(f"Finalizing report with CallInfo: {vars(call)}")
if call.excinfo is None:
info_details.metrics.success = True
else:
@@ -135,17 +134,15 @@ def update_challenges_already_beaten(
current_run_successful = info_details.metrics.success
try:
with open(challenges_already_beaten_file, "r") as f:
- challenge_data = json.load(f)
+ challenges_beaten_before = json.load(f)
except FileNotFoundError:
- challenge_data = {}
- challenge_beaten_in_the_past = challenge_data.get(test_name)
+ challenges_beaten_before = {}
- challenge_data[test_name] = True
- if challenge_beaten_in_the_past is None and not current_run_successful:
- challenge_data[test_name] = False
+ has_ever_been_beaten = challenges_beaten_before.get(test_name)
+ challenges_beaten_before[test_name] = has_ever_been_beaten or current_run_successful
with open(challenges_already_beaten_file, "w") as f:
- json.dump(challenge_data, f, indent=4)
+ json.dump(challenges_beaten_before, f, indent=4)
def session_finish(agbenchmark_config: AgentBenchmarkConfig) -> None: