import json
import os
import sys
from typing import Any, Dict, Optional

from agbenchmark.__main__ import CHALLENGES_ALREADY_BEATEN
from agbenchmark.reports.agent_benchmark_config import get_agent_benchmark_config
from agbenchmark.reports.ReportManager import SingletonReportManager
from agbenchmark.utils.data_types import DifficultyLevel
from agbenchmark.utils.get_data_from_helicone import get_data_from_helicone
from agbenchmark.utils.utils import calculate_success_percentage
def get_previous_test_results(
    test_name: str, info_details: dict[str, Any]
) -> list[bool]:
    """Fetch the stored pass/fail history for *test_name* and update it.

    Appends the current run's success flag to the persisted history (skipped
    in mock mode so mock runs don't pollute real results) and writes the
    overall success percentage into ``info_details["metrics"]["success_%"]``.

    Args:
        test_name: Identifier of the test being reported on.
        info_details: Mutable report entry; ``metrics.success`` is read and
            ``metrics.success_%`` is written.

    Returns:
        The (possibly extended) list of historical success flags.
    """
    # Removed: an `agent_tests` dict was declared here but never used.
    mock = os.getenv("IS_MOCK")  # set when the suite runs with --mock

    prev_test_results = SingletonReportManager().INTERNAL_INFO_MANAGER.tests.get(
        test_name, []
    )

    if not mock:
        # only add if it's an actual (non-mock) test run
        prev_test_results.append(info_details["metrics"]["success"])
        SingletonReportManager().INTERNAL_INFO_MANAGER.add_test(
            test_name, prev_test_results
        )

    # can calculate success rate regardless of mock
    info_details["metrics"]["success_%"] = calculate_success_percentage(
        prev_test_results
    )

    return prev_test_results
def update_regression_tests(
    prev_test_results: list[bool],
    info_details: dict,
    test_name: str,
    test_details: dict,
) -> None:
    """Promote a test to the regression suite after 3 consecutive passes.

    Mutates ``info_details["is_regression"]`` and registers *test_details*
    with the regression manager when the last three recorded outcomes are
    all successes; otherwise leaves everything untouched.
    """
    # A three-element slice equal to [True, True, True] also guarantees the
    # history holds at least three entries.
    if prev_test_results[-3:] != [True, True, True]:
        return

    info_details["is_regression"] = True
    SingletonReportManager().REGRESSION_MANAGER.add_test(test_name, test_details)
def generate_single_call_report(
    item: Any,
    call: Any,
    challenge_data: dict[str, Any],
    answers: dict[str, Any],
    challenge_location: str,
    test_name: str,
) -> Optional[dict[str, Any]]:
    """Build the per-challenge report entry for a single test call.

    Fix: the original signature was annotated ``-> None`` although the
    function returns the assembled report dict.

    Args:
        item: The pytest item; receives the report as ``item.info_details``
            when truthy.
        call: The pytest call-info object (its ``excinfo`` is inspected),
            or a falsy value when no call outcome is available.
        challenge_data: The challenge definition (info/category/task/ground).
        answers: The agent's answers; included in the report when non-empty.
        challenge_location: Path to the challenge's data.
        test_name: Identifier of the test being reported on.

    Returns:
        The assembled report dict, or None when the challenge definition
        carries no difficulty rating (nothing meaningful to report).
    """
    try:
        difficulty = challenge_data["info"]["difficulty"]
    except KeyError:
        return None

    if isinstance(difficulty, DifficultyLevel):
        difficulty = difficulty.value

    test_details = {
        "difficulty": difficulty,
        "data_path": challenge_location,
    }

    info_details: Any = {
        "data_path": challenge_location,
        "is_regression": False,
        "category": challenge_data["category"],
        "task": challenge_data["task"],
        "answer": challenge_data["ground"]["answer"],
        "description": challenge_data["info"]["description"],
        "metrics": {
            "difficulty": difficulty,
            "success": False,
            "attempted": True,
        },
    }
    if answers:
        info_details["answers"] = answers

    if "metadata" in challenge_data:
        info_details["metadata"] = challenge_data["metadata"]

    mock = os.getenv("IS_MOCK")  # set when the suite runs with --mock
    if call:
        if call.excinfo is None:
            info_details["metrics"]["success"] = True
        else:
            if not mock:  # don't remove if it's a mock test
                SingletonReportManager().REGRESSION_MANAGER.remove_test(test_name)
            info_details["metrics"]["fail_reason"] = str(call.excinfo.value)
            if call.excinfo.typename == "Skipped":
                info_details["metrics"]["attempted"] = False

    prev_test_results: list[bool] = get_previous_test_results(test_name, info_details)

    update_regression_tests(prev_test_results, info_details, test_name, test_details)

    # user facing reporting
    if item:
        item.info_details = info_details

    return info_details
def finalize_reports(item: Any, challenge_data: dict[str, Any]) -> None:
    """Finish the report on *item* and hand it to the info manager.

    Reads ``item.info_details`` / ``item.test_name`` (set earlier during
    report generation); does nothing when either is missing. When a run time
    was recorded, fills in cost, run time, attempted/success defaults and the
    cutoff flag, and — outside mock mode — updates the already-beaten ledger
    for the test and any nested sub-tests.
    """
    run_time = dict(item.user_properties).get("run_time")
    info_details = getattr(item, "info_details", {})
    test_name = getattr(item, "test_name", "")

    if not (info_details and test_name):
        return

    if run_time is not None:
        metrics = info_details["metrics"]

        cost = None
        if "--mock" not in sys.argv and os.environ.get("HELICONE_API_KEY"):
            print("Getting cost from Helicone")
            cost = get_data_from_helicone(test_name)
        metrics["cost"] = cost

        # Normalize the success/attempted pair: a missing success means the
        # test never ran; an unmarked failure was never attempted.
        if metrics.get("success", None) is None:
            metrics["attempted"] = False
            metrics["success"] = False
        elif metrics.get("success") is False and "attempted" not in metrics:
            metrics["attempted"] = False

        metrics["run_time"] = f"{str(round(run_time, 3))} seconds"
        info_details["reached_cutoff"] = float(run_time) > challenge_data["cutoff"]

        if "--mock" not in sys.argv:
            update_challenges_already_beaten(info_details, test_name)
            nested_tests = info_details.get("tests")
            if nested_tests is not None:
                for sub_name, sub_info in nested_tests.items():
                    update_challenges_already_beaten(sub_info, sub_name)

    SingletonReportManager().INFO_MANAGER.add_test(test_name, info_details)
def update_challenges_already_beaten(
    info_details: Dict[str, Any], test_name: str
) -> None:
    """Record in the already-beaten ledger whether *test_name* ever passed.

    An entry becomes ``True`` once any run succeeds and stays ``True``;
    ``False`` means the challenge has been attempted but never beaten.

    Fixes:
    - The bare ``except:`` (which swallowed even KeyboardInterrupt) is
      narrowed to the two errors a first/corrupt ledger can raise.
    - Previously a challenge already recorded as ``False`` was flipped to
      ``True`` even when the current run also failed, because the reset
      condition only checked for a missing entry.
    """
    current_run_successful = info_details["metrics"]["success"]
    try:
        with open(CHALLENGES_ALREADY_BEATEN, "r") as f:
            challenge_data = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        # First run (no ledger yet) or an unreadable ledger: start fresh.
        challenge_data = {}

    already_beaten = challenge_data.get(test_name)
    # "Beaten" sticks only once a run actually succeeds.
    challenge_data[test_name] = bool(already_beaten) or bool(current_run_successful)

    with open(CHALLENGES_ALREADY_BEATEN, "w") as f:
        json.dump(challenge_data, f, indent=4)
def session_finish(suite_reports: dict) -> None:
    """Persist every report manager at the end of the benchmark session.

    NOTE(review): *suite_reports* is accepted for interface compatibility but
    is not read here.
    """
    benchmark_config = get_agent_benchmark_config()

    # Flush internal test history, finalize the user-facing info report, and
    # save the regression suite.
    SingletonReportManager().INTERNAL_INFO_MANAGER.save()
    SingletonReportManager().INFO_MANAGER.end_info_report(benchmark_config)
    SingletonReportManager().REGRESSION_MANAGER.save()