path: root/benchmark/agbenchmark/reports/reports.py
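"""Helpers for building, updating and persisting agbenchmark test reports."""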
import json
import logging
import os
from pathlib import Path

import pytest

from agbenchmark.challenges import ChallengeInfo
from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.reports.processing.report_types import Test, TestMetrics, TestResult
from agbenchmark.reports.ReportManager import SingletonReportManager
from agbenchmark.utils.data_types import DifficultyLevel

# from agbenchmark.utils.get_data_from_helicone import get_data_from_helicone

logger = logging.getLogger(__name__)


def get_and_update_success_history(
    test_name: str, success: bool | None
) -> list[bool | None]:
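    """Return the success history for `test_name`, updated with `success`.

    In mock runs (IS_MOCK set), the stored history is returned unchanged and
    nothing is persisted.
    """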
    mock = os.getenv("IS_MOCK")  # set by the CLI when --mock is passed

    prev_test_results = SingletonReportManager().SUCCESS_RATE_TRACKER.tests.get(
        test_name, []
    )

    if not mock:
        # only add if it's an actual test
        prev_test_results.append(success)
        SingletonReportManager().SUCCESS_RATE_TRACKER.update(
            test_name, prev_test_results
        )

    return prev_test_results


def update_regression_tests(
    prev_test_results: list[bool | None],
    test_report: Test,
    test_name: str,
) -> None:
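    """Flag `test_report` as a regression test and register it with the
    regression manager if the last three recorded runs all succeeded.
    """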
    if len(prev_test_results) >= 3 and prev_test_results[-3:] == [True, True, True]:
        # if the last 3 tests were successful, add to the regression tests
        test_report.metrics.is_regression = True
        SingletonReportManager().REGRESSION_MANAGER.add_test(
            test_name, test_report.dict(include={"difficulty", "data_path"})
        )


def make_empty_test_report(
    challenge_info: ChallengeInfo,
) -> Test:
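    """Build a `Test` report pre-filled with challenge metadata and no results."""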
    difficulty = challenge_info.difficulty
    if isinstance(difficulty, DifficultyLevel):
        difficulty = difficulty.value

    return Test(
        category=[c.value for c in challenge_info.category],
        difficulty=difficulty,
        data_path=challenge_info.source_uri,
        description=challenge_info.description or "",
        task=challenge_info.task,
        answer=challenge_info.reference_answer or "",
        metrics=TestMetrics(attempted=False, is_regression=False),
        results=[],
    )


def add_test_result_to_report(
    test_report: Test,
    item: pytest.Item,
    call: pytest.CallInfo,
    config: AgentBenchmarkConfig,
) -> None:
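    """Record the outcome of a single challenge run on `test_report`.

    Appends a `TestResult` built from the pytest `call` info, updates the
    success percentage, success history and regression status, and (for
    non-mock runs) persists which challenges have been beaten.
    """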
    user_properties: dict = dict(item.user_properties)
    test_name: str = user_properties.get("test_name", "")

    mock = os.getenv("IS_MOCK")  # set by the CLI when --mock is passed

    if call.excinfo:
        if not mock:
            SingletonReportManager().REGRESSION_MANAGER.remove_test(test_name)

        test_report.metrics.attempted = call.excinfo.typename != "Skipped"
    else:
        test_report.metrics.attempted = True

    test_report.results.append(
        TestResult(
            success=call.excinfo is None,
            run_time=f"{round(call.duration, 3)} seconds",
            fail_reason=str(call.excinfo.value) if call.excinfo else None,
            reached_cutoff=user_properties.get("timed_out", False),
        )
    )
    test_report.metrics.success_percentage = (
        sum(r.success or False for r in test_report.results)
        / len(test_report.results)
        * 100
    )

    prev_test_results: list[bool | None] = get_and_update_success_history(
        test_name, test_report.results[-1].success
    )

    update_regression_tests(prev_test_results, test_report, test_name)

    if test_report and test_name:
        # if "--mock" not in sys.argv and os.environ.get("HELICONE_API_KEY"):
        #     logger.debug("Getting cost from Helicone")
        #     test_report.metrics.cost = get_data_from_helicone(test_name)
        #     logger.debug(f"Cost: {test_report.metrics.cost}")

        if not mock:
            update_challenges_already_beaten(
                config.challenges_already_beaten_file, test_report, test_name
            )

        SingletonReportManager().INFO_MANAGER.add_test_report(test_name, test_report)


def update_challenges_already_beaten(
    challenges_already_beaten_file: Path, test_report: Test, test_name: str
) -> None:
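    """Record in `challenges_already_beaten_file` whether `test_name` has ever
    been beaten, OR-ing this run's outcome with any previous record.
    """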
    current_run_successful = any(r.success for r in test_report.results)
    try:
        with open(challenges_already_beaten_file, "r") as f:
            challenges_beaten_before = json.load(f)
    except FileNotFoundError:
        challenges_beaten_before = {}

    has_ever_been_beaten = challenges_beaten_before.get(test_name)
    challenges_beaten_before[test_name] = has_ever_been_beaten or current_run_successful

    with open(challenges_already_beaten_file, "w") as f:
        json.dump(challenges_beaten_before, f, indent=4)


def session_finish(agbenchmark_config: AgentBenchmarkConfig) -> None:
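    """Finalize the session report and persist regression and success-rate data."""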
    SingletonReportManager().INFO_MANAGER.finalize_session_report(agbenchmark_config)
    SingletonReportManager().REGRESSION_MANAGER.save()
    SingletonReportManager().SUCCESS_RATE_TRACKER.save()
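

# Typical wiring from the benchmark's pytest conftest (a sketch under assumptions:
# `challenge_info` and `agbenchmark_config` stand in for objects the real conftest
# already has in scope; hook bodies are abbreviated):
#
#     test_report = make_empty_test_report(challenge_info)
#
#     def pytest_runtest_makereport(item: pytest.Item, call: pytest.CallInfo):
#         if call.when == "call":
#             add_test_result_to_report(test_report, item, call, agbenchmark_config)
#
#     def pytest_sessionfinish(session):
#         session_finish(agbenchmark_config)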