import json
import logging
import os
import sys
from pathlib import Path
from typing import Any

import pytest

from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.reports.ReportManager import SingletonReportManager
from agbenchmark.utils.data_types import ChallengeData, DifficultyLevel
from agbenchmark.utils.utils import calculate_success_percentage

# from agbenchmark.utils.get_data_from_helicone import get_data_from_helicone

logger = logging.getLogger(__name__)


def get_previous_test_results(
    test_name: str, info_details: dict[str, Any]
) -> list[bool]:
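    """Retrieve a test's historical pass/fail results and record the current one.

    Outside of mock mode, the current result is appended and persisted via the
    internal info manager. The running success percentage is computed either
    way and stored under info_details["metrics"]["success_%"].
    """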
    mock = os.getenv("IS_MOCK")  # set when the benchmark is run with --mock

    prev_test_results = SingletonReportManager().INTERNAL_INFO_MANAGER.tests.get(
        test_name, []
    )

    if not mock:
        # only add if it's an actual test
        prev_test_results.append(info_details["metrics"]["success"])
        SingletonReportManager().INTERNAL_INFO_MANAGER.add_test(
            test_name, prev_test_results
        )

    # can calculate success rate regardless of mock
    info_details["metrics"]["success_%"] = calculate_success_percentage(
        prev_test_results
    )

    return prev_test_results


def update_regression_tests(
    prev_test_results: list[bool],
    info_details: dict[str, Any],
    test_name: str,
    test_details: dict[str, Any],
) -> None:
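    """Mark the test as a regression test if its last three runs all succeeded."""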
    if len(prev_test_results) >= 3 and prev_test_results[-3:] == [True, True, True]:
        # if the last 3 tests were successful, add to the regression tests
        info_details["is_regression"] = True
        SingletonReportManager().REGRESSION_MANAGER.add_test(test_name, test_details)


def generate_single_call_report(
    item: pytest.Item,
    call: pytest.CallInfo,
    challenge_data: ChallengeData,
    answers: dict[str, Any],
    challenge_location: str,
    test_name: str,
) -> dict[str, Any]:
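    """Build the info_details report for a single challenge run.

    Records difficulty, success or failure (including skips and failure
    reasons), updates the historical results and regression status, and
    attaches the report to the pytest item for user-facing output.
    """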
    difficulty = challenge_data.info.difficulty

    if isinstance(difficulty, DifficultyLevel):
        difficulty = difficulty.value

    # Extract the challenge_location from the class
    # challenge_location: str = getattr(item.cls, "CHALLENGE_LOCATION", "")
    # test_name = item.nodeid.split("::")[1]
    # item.test_name = test_name

    test_details = {
        "difficulty": difficulty,
        "data_path": challenge_location,
    }

    info_details: dict[str, Any] = {
        "data_path": challenge_location,
        "is_regression": False,
        "category": challenge_data.category,
        "task": challenge_data.task,
        "answer": challenge_data.ground.answer,
        "description": challenge_data.info.description,
        "metrics": {
            "difficulty": difficulty,
            "success": False,
            "attempted": True,
        },
        # "answers": answers,
    }
    if answers:
        info_details["answers"] = answers

    if challenge_data.metadata:
        info_details["metadata"] = challenge_data.metadata

    mock = os.getenv("IS_MOCK")  # set when the benchmark is run with --mock
    if call:
        if call.excinfo is None:
            info_details["metrics"]["success"] = True
        else:
            if not mock:  # don't remove if it's a mock test
                SingletonReportManager().REGRESSION_MANAGER.remove_test(test_name)
            info_details["metrics"]["fail_reason"] = str(call.excinfo.value)
            if call.excinfo.typename == "Skipped":
                info_details["metrics"]["attempted"] = False

    prev_test_results: list[bool] = get_previous_test_results(test_name, info_details)

    update_regression_tests(prev_test_results, info_details, test_name, test_details)

    # user-facing reporting
    if item:
        item.info_details = info_details

    return info_details


def finalize_reports(
    config: AgentBenchmarkConfig, item: pytest.Item, challenge_data: ChallengeData
) -> None:
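    """Finalize a test's report with cost, run time, and cutoff status, then
    persist it via the info manager.

    Outside of mock mode, also updates the challenges-already-beaten file for
    the test and any nested tests.
    """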
    run_time = dict(item.user_properties).get("run_time")

    info_details = getattr(item, "info_details", {})
    test_name = getattr(item, "test_name", "")

    if info_details and test_name:
        if run_time is not None:
            cost = None
            # if "--mock" not in sys.argv and os.environ.get("HELICONE_API_KEY"):
            #     logger.debug("Getting cost from Helicone")
            #     cost = get_data_from_helicone(test_name)
            #     logger.debug(f"Cost: {cost}")

            info_details["metrics"]["cost"] = cost

            if info_details["metrics"].get("success") is None:
                info_details["metrics"]["attempted"] = False
                info_details["metrics"]["success"] = False
            elif (
                info_details["metrics"].get("success") is False
                and "attempted" not in info_details["metrics"]
            ):
                info_details["metrics"]["attempted"] = False

            info_details["metrics"]["run_time"] = f"{str(round(run_time, 3))} seconds"

            info_details["reached_cutoff"] = float(run_time) > challenge_data.cutoff

            if "--mock" not in sys.argv:
                update_challenges_already_beaten(
                    config.challenges_already_beaten_file, info_details, test_name
                )
                if info_details.get("tests") is not None:
                    for nested_test_name, nested_test_info in info_details[
                        "tests"
                    ].items():
                        update_challenges_already_beaten(
                            config.challenges_already_beaten_file,
                            nested_test_info,
                            nested_test_name,
                        )

        SingletonReportManager().INFO_MANAGER.add_test(test_name, info_details)


def update_challenges_already_beaten(
    challenges_already_beaten_file: Path, info_details: dict[str, Any], test_name: str
) -> None:
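    """Record in the challenges-already-beaten file whether this challenge has
    ever been completed successfully.
    """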
    current_run_successful = info_details["metrics"]["success"]
    try:
        with open(challenges_already_beaten_file, "r") as f:
            challenge_data = json.load(f)
    except FileNotFoundError:
        challenge_data = {}
    challenge_beaten_in_the_past = challenge_data.get(test_name)

    # A challenge counts as beaten once any run, past or present, has succeeded.
    challenge_data[test_name] = bool(
        challenge_beaten_in_the_past or current_run_successful
    )

    with open(challenges_already_beaten_file, "w") as f:
        json.dump(challenge_data, f, indent=4)


def session_finish(
    agbenchmark_config: AgentBenchmarkConfig, suite_reports: dict
) -> None:
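    """Save all collected reports at the end of a benchmark session."""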
    SingletonReportManager().INTERNAL_INFO_MANAGER.save()
    SingletonReportManager().INFO_MANAGER.end_info_report(agbenchmark_config)
    SingletonReportManager().REGRESSION_MANAGER.save()