path: root/autogpts/autogpt/agbenchmark_config/analyze_reports.py
#!/usr/bin/env python3
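"""Analyze agbenchmark reports.

Collects the report.json of every timestamped run directory under ./reports,
groups per-test result indicators by run label, and prints a grid comparing
the runs side by side.
"""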

import json
import logging
import re
import sys
from collections import defaultdict
from pathlib import Path

from tabulate import tabulate

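# CLI flags: -v enables INFO logging, -vv enables DEBUG logging;
# --granular also reports the individual tests inside each test suite.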
info = "-v" in sys.argv
debug = "-vv" in sys.argv
granular = "--granular" in sys.argv

logging.basicConfig(
    level=logging.DEBUG if debug else logging.INFO if info else logging.WARNING
)
logger = logging.getLogger(__name__)

# Collect the report.json from every timestamped run directory under ./reports
report_files = [
    report_file
    for report_dir in (Path(__file__).parent / "reports").iterdir()
    if re.match(r"^\d{8}T\d{6}_", report_dir.name)
    and (report_file := report_dir / "report.json").is_file()
]
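
# Assumed shape of a report.json, inferred from the accesses below
# (illustrative only; any other fields are ignored):
# {
#   "agent_git_commit_sha": ".../<full commit sha>",
#   "tests": {
#     "TestSomeTask": {"metrics": {"attempted": true, "success": true}},
#     "TestSomeSuite": {
#       "metrics": {"percentage": 50.0, "highest_difficulty": "basic"},
#       "tests": {"TestSubTask": {"metrics": {"attempted": true, "success": false}}}
#     }
#   }
# }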

labels = list[str]()  # distinct run labels, in chronological order
runs_per_label = defaultdict[str, int](lambda: 0)
test_names = list[str]()  # determines the row order of the output table

# Success indicators keyed by "<label>|<test_name>", one entry per run
grouped_success_values = defaultdict[str, list[str]](list[str])

# Loop through each report to collect run labels and success values
for report_file in sorted(report_files):
    with open(report_file) as f:
        logger.info(f"Loading {report_file}...")

        data = json.load(f)
        if "tests" in data:
            test_tree = data["tests"]
            label = data["agent_git_commit_sha"].rsplit("/", 1)[-1][:7]  # short commit hash
        else:
            # Benchmark run still in progress
            test_tree = data
            label = report_file.parent.name.split("_", 1)[1]
            logger.info(f"Run '{label}' seems to be in progress")

        runs_per_label[label] += 1

        def process_test(test_name: str, test_data: dict):
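            """Record the current run's result indicator for `test_name` under
            `label`, padding runs that produced no result so that indicators
            stay aligned per run."""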
            result_group = grouped_success_values[f"{label}|{test_name}"]

            if "tests" in test_data:
                logger.debug(f"{test_name} is a test suite")

                # Test suite
                suite_attempted = any(
                    test["metrics"]["attempted"] for test in test_data["tests"].values()
                )
                logger.debug(f"suite_attempted: {suite_attempted}")
                if not suite_attempted:
                    return

                if test_name not in test_names:
                    test_names.append(test_name)

                if test_data["metrics"]["percentage"] == 0:
                    result_indicator = "❌"
                else:
                    highest_difficulty = test_data["metrics"]["highest_difficulty"]
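                    # Moon-phase emoji encode the highest difficulty reached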
                    result_indicator = {
                        "interface": "🔌",
                        "novice": "🌑",
                        "basic": "🌒",
                        "intermediate": "🌓",
                        "advanced": "🌔",
                        "hard": "🌕",
                    }[highest_difficulty]

                logger.debug(f"result group: {result_group}")
                logger.debug(f"runs_per_label: {runs_per_label[label]}")
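                # Pad with ❔ for earlier runs in which this suite produced
                # no result, so each position in the group lines up with a run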
                if len(result_group) + 1 < runs_per_label[label]:
                    result_group.extend(
                        ["❔"] * (runs_per_label[label] - len(result_group) - 1)
                    )
                result_group.append(result_indicator)
                logger.debug(f"result group (after): {result_group}")

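                # With --granular, also report each test inside the suite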
                if granular:
                    for test_name, test in test_data["tests"].items():
                        process_test(test_name, test)
                return

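            # Leaf test: ✅/❌ when attempted, ❔ when known but not attempted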
            test_metrics = test_data["metrics"]
            result_indicator = "❔"

            if "attempted" not in test_metrics:
                return
            elif test_metrics["attempted"]:
                if test_name not in test_names:
                    test_names.append(test_name)

                success_value = test_metrics["success"]
                result_indicator = {True: "✅", False: "❌"}[success_value]

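            # Pad earlier result-less runs with blanks to keep columns aligned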
            if len(result_group) + 1 < runs_per_label[label]:
                result_group.extend(
                    ["  "] * (runs_per_label[label] - len(result_group) - 1)
                )
            result_group.append(result_indicator)

        for test_name, suite in test_tree.items():
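            # If an entry is malformed, surface its metrics before re-raising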
            try:
                process_test(test_name, suite)
            except KeyError:
                print(f"{test_name}.metrics: {suite.get('metrics')}")
                raise

    if label not in labels:
        labels.append(label)

# Create headers
headers = ["Test Name"] + list(labels)

# Build one table row per test; each cell joins that label's per-run indicators
table_data = list[list[str]]()
for test_name in test_names:
    row = [test_name]
    for label in labels:
        results = grouped_success_values.get(f"{label}|{test_name}", ["❔"])
        if len(results) < runs_per_label[label]:
            results.extend(["❔"] * (runs_per_label[label] - len(results)))
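        # If every run is unknown, show an empty cell instead of a row of ❔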
        if len(results) > 1 and all(r == "❔" for r in results):
            results.clear()
        row.append(" ".join(results))
    table_data.append(row)

# Print tabulated data
print(tabulate(table_data, headers=headers, tablefmt="grid"))