Diffstat (limited to 'autogpts/autogpt/agbenchmark_config/analyze_reports.py')
-rw-r--r-- | autogpts/autogpt/agbenchmark_config/analyze_reports.py | 143
1 file changed, 143 insertions, 0 deletions
diff --git a/autogpts/autogpt/agbenchmark_config/analyze_reports.py b/autogpts/autogpt/agbenchmark_config/analyze_reports.py
new file mode 100644
index 000000000..2a8f95443
--- /dev/null
+++ b/autogpts/autogpt/agbenchmark_config/analyze_reports.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python3
+
+import json
+import logging
+import re
+import sys
+from collections import defaultdict
+from pathlib import Path
+
+from tabulate import tabulate
+
+info = "-v" in sys.argv
+debug = "-vv" in sys.argv
+granular = "--granular" in sys.argv
+
+logging.basicConfig(
+    level=logging.DEBUG if debug else logging.INFO if info else logging.WARNING
+)
+logger = logging.getLogger(__name__)
+
+# Get a list of all JSON files in the directory
+report_files = [
+    report_file
+    for dir in (Path(__file__).parent / "reports").iterdir()
+    if re.match(r"^\d{8}T\d{6}_", dir.name)
+    and (report_file := dir / "report.json").is_file()
+]
+
+labels = list[str]()
+runs_per_label = defaultdict[str, int](lambda: 0)
+suite_names = list[str]()
+test_names = list[str]()
+
+# Create a dictionary to store grouped success values by suffix and test
+grouped_success_values = defaultdict[str, list[str]](list[str])
+
+# Loop through each JSON file to collect suffixes and success values
+for report_file in sorted(report_files):
+    with open(report_file) as f:
+        logger.info(f"Loading {report_file}...")
+
+        data = json.load(f)
+        if "tests" in data:
+            test_tree = data["tests"]
+            label = data["agent_git_commit_sha"].rsplit("/", 1)[1][:7]  # commit hash
+        else:
+            # Benchmark run still in progress
+            test_tree = data
+            label = report_file.parent.name.split("_", 1)[1]
+            logger.info(f"Run '{label}' seems to be in progress")
+
+        runs_per_label[label] += 1
+
+        def process_test(test_name: str, test_data: dict):
+            result_group = grouped_success_values[f"{label}|{test_name}"]
+
+            if "tests" in test_data:
+                logger.debug(f"{test_name} is a test suite")
+
+                # Test suite
+                suite_attempted = any(
+                    test["metrics"]["attempted"] for test in test_data["tests"].values()
+                )
+                logger.debug(f"suite_attempted: {suite_attempted}")
+                if not suite_attempted:
+                    return
+
+                if test_name not in test_names:
+                    test_names.append(test_name)
+
+                if test_data["metrics"]["percentage"] == 0:
+                    result_indicator = "❌"
+                else:
+                    highest_difficulty = test_data["metrics"]["highest_difficulty"]
+                    result_indicator = {
+                        "interface": "🔌",
+                        "novice": "🌑",
+                        "basic": "🌒",
+                        "intermediate": "🌓",
+                        "advanced": "🌔",
+                        "hard": "🌕",
+                    }[highest_difficulty]
+
+                logger.debug(f"result group: {result_group}")
+                logger.debug(f"runs_per_label: {runs_per_label[label]}")
+                if len(result_group) + 1 < runs_per_label[label]:
+                    result_group.extend(
+                        ["❔"] * (runs_per_label[label] - len(result_group) - 1)
+                    )
+                result_group.append(result_indicator)
+                logger.debug(f"result group (after): {result_group}")
+
+                if granular:
+                    for test_name, test in test_data["tests"].items():
+                        process_test(test_name, test)
+                return
+
+            test_metrics = test_data["metrics"]
+            result_indicator = "❔"
+
+            if "attempted" not in test_metrics:
+                return
+            elif test_metrics["attempted"]:
+                if test_name not in test_names:
+                    test_names.append(test_name)
+
+                success_value = test_metrics["success"]
+                result_indicator = {True: "✅", False: "❌"}[success_value]
+
+            if len(result_group) + 1 < runs_per_label[label]:
+                result_group.extend(
+                    [" "] * (runs_per_label[label] - len(result_group) - 1)
+                )
+            result_group.append(result_indicator)
+
+        for test_name, suite in test_tree.items():
+            try:
+                process_test(test_name, suite)
+            except KeyError:
+                print(f"{test_name}.metrics: {suite['metrics']}")
+                raise
+
+    if label not in labels:
+        labels.append(label)
+
+# Create headers
+headers = ["Test Name"] + list(labels)
+
+# Prepare data for tabulation
+table_data = list[list[str]]()
+for test_name in test_names:
+    row = [test_name]
+    for label in labels:
+        results = grouped_success_values.get(f"{label}|{test_name}", ["❔"])
+        if len(results) < runs_per_label[label]:
+            results.extend(["❔"] * (runs_per_label[label] - len(results)))
+        if len(results) > 1 and all(r == "❔" for r in results):
+            results.clear()
+        row.append(" ".join(results))
+    table_data.append(row)
+
+# Print tabulated data
+print(tabulate(table_data, headers=headers, tablefmt="grid"))