path: root/benchmark/agbenchmark/app.py
import datetime
import glob
import json
import logging
import sys
import time
import uuid
from collections import deque
from multiprocessing import Process
from pathlib import Path
from typing import Optional

import httpx
import psutil
from agent_protocol_client import AgentApi, ApiClient, ApiException, Configuration
from agent_protocol_client.models import Task, TaskRequestBody
from fastapi import APIRouter, FastAPI, HTTPException, Request, Response
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Extra, ValidationError

from agbenchmark.challenges import ChallengeInfo
from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.reports.processing.report_types_v2 import (
    BenchmarkRun,
    Metrics,
    RepositoryInfo,
    RunDetails,
    TaskInfo,
)
from agbenchmark.schema import TaskEvalRequestBody
from agbenchmark.utils.utils import write_pretty_json

sys.path.append(str(Path(__file__).parent.parent))

logger = logging.getLogger(__name__)

CHALLENGES: dict[str, ChallengeInfo] = {}
challenges_path = Path(__file__).parent / "challenges"
challenge_spec_files = deque(
    glob.glob(
        f"{challenges_path}/**/data.json",
        recursive=True,
    )
)

logger.debug("Loading challenges...")
while challenge_spec_files:
    challenge_spec_file = Path(challenge_spec_files.popleft())
    challenge_relpath = challenge_spec_file.relative_to(challenges_path.parent)
    if challenge_relpath.is_relative_to("challenges/deprecated"):
        continue

    logger.debug(f"Loading {challenge_relpath}...")
    try:
        challenge_info = ChallengeInfo.parse_file(challenge_spec_file)
    except ValidationError as e:
        if logging.getLogger().level == logging.DEBUG:
            logger.warning(f"Spec file {challenge_relpath} failed to load:\n{e}")
        logger.debug(f"Invalid challenge spec: {challenge_spec_file.read_text()}")
        continue
    challenge_info.spec_file = challenge_spec_file

    if not challenge_info.eval_id:
        challenge_info.eval_id = str(uuid.uuid4())
        # write_pretty_json sorts the JSON keys so that the file
        # contents are always written in the same, deterministic order
        write_pretty_json(challenge_info.dict(), challenge_spec_file)

    CHALLENGES[challenge_info.eval_id] = challenge_info


class BenchmarkTaskInfo(BaseModel):
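    """Runtime info for a benchmark task: ID, start time, and its challenge."""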
    task_id: str
    start_time: datetime.datetime
    challenge_info: ChallengeInfo


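# Tasks created through this app during the current session, keyed by task ID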
task_informations: dict[str, BenchmarkTaskInfo] = {}


def find_agbenchmark_without_uvicorn():
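    """
    Return the PIDs of running processes whose info mentions 'agbenchmark'
    but not 'uvicorn'.
    """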
    pids = []
    for process in psutil.process_iter(
        attrs=[
            "pid",
            "cmdline",
            "name",
            "username",
            "status",
            "cpu_percent",
            "memory_info",
            "create_time",
            "cwd",
            "connections",
        ]
    ):
        try:
            # Concatenate all process info values into one searchable string
            full_info = " ".join(str(v) for v in process.as_dict().values())

            if "agbenchmark" in full_info and "uvicorn" not in full_info:
                pids.append(process.pid)
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            pass
    return pids


class CreateReportRequest(BaseModel):
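    """Request body for `POST /reports`."""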
    test: Optional[str] = None
    test_run_id: Optional[str] = None
    # category: Optional[str] = []
    mock: Optional[bool] = False

    class Config:
        extra = Extra.forbid  # this will forbid any extra fields


updates_list = []

origins = [
    "http://localhost:8000",
    "http://localhost:8080",
    "http://127.0.0.1:5000",
    "http://localhost:5000",
]


def stream_output(pipe):
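    """Echo lines from the given pipe to stdout as they arrive."""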
    for line in pipe:
        print(line, end="")


def setup_fastapi_app(agbenchmark_config: AgentBenchmarkConfig) -> FastAPI:
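    """
    Build the FastAPI app exposing the benchmark's report and Agent Protocol
    proxy endpoints, configured for the given `AgentBenchmarkConfig`.
    """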
    from agbenchmark.agent_api_interface import upload_artifacts
    from agbenchmark.challenges import get_challenge_from_source_uri
    from agbenchmark.main import run_benchmark

    configuration = Configuration(
        host=agbenchmark_config.host or "http://localhost:8000"
    )
    app = FastAPI()
    app.add_middleware(
        CORSMiddleware,
        allow_origins=origins,
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
    router = APIRouter()

    @router.post("/reports")
    def run_single_test(body: CreateReportRequest) -> dict:
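        """
        Run a single benchmark test in a subprocess and return the contents of
        the resulting report.json from the newest reports folder.
        """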
        pids = find_agbenchmark_without_uvicorn()
        logger.info(f"pids already running with agbenchmark: {pids}")

        logger.debug(f"Request to /reports: {body.dict()}")

        # Start the benchmark in a separate process
        benchmark_process = Process(
            target=lambda: run_benchmark(
                config=agbenchmark_config,
                tests=(body.test,),
                mock=body.mock or False,
            )
        )
        benchmark_process.start()

        # Wait for the benchmark to finish, with a timeout of 200 seconds
        timeout = 200
        start_time = time.time()
        while benchmark_process.is_alive():
            if time.time() - start_time > timeout:
                logger.warning(f"Benchmark run timed out after {timeout} seconds")
                benchmark_process.terminate()
                break
            time.sleep(1)
        else:
            logger.debug(f"Benchmark finished running in {time.time() - start_time} s")

        # List all folders in the current working directory
        path_reports = agbenchmark_config.reports_folder
        folders = [folder for folder in path_reports.iterdir() if folder.is_dir()]

        # Sort the folders based on their names
        sorted_folders = sorted(folders, key=lambda x: x.name)

        # Get the last folder
        latest_folder = sorted_folders[-1] if sorted_folders else None

        # Read report.json from this folder
        if latest_folder:
            report_path = latest_folder / "report.json"
            logger.debug(f"Getting latest report from {report_path}")
            if report_path.exists():
                with report_path.open() as file:
                    data = json.load(file)
                logger.debug(f"Report data: {data}")
            else:
                logger.error(
                    "Could not get result after running benchmark: "
                    f"'report.json' does not exist in '{latest_folder}'"
                )
                raise HTTPException(
                    500, "Benchmark finished but no report.json was generated"
                )
        else:
            logger.error(
                "Could not get result after running benchmark: no reports found"
            )
            raise HTTPException(
                500, "Benchmark did not produce any reports"
            )

        return data

    @router.post("/agent/tasks", tags=["agent"])
    async def create_agent_task(task_eval_request: TaskEvalRequestBody) -> Task:
        """
        Creates a new task using the provided TaskEvalRequestBody and returns a Task.

        Args:
            task_eval_request: `TaskEvalRequestBody` including an eval_id.

        Returns:
            Task: A new task with task_id, input, additional_input,
                and empty lists for artifacts and steps.

        Example:
            Request (TaskEvalRequestBody defined in schema.py):
                {
                    ...,
                    "eval_id": "50da533e-3904-4401-8a07-c49adf88b5eb"
                }

            Response (Task defined in `agent_protocol_client.models`):
                {
                    "task_id": "50da533e-3904-4401-8a07-c49adf88b5eb",
                    "input": "Write the word 'Washington' to a .txt file",
                    "artifacts": []
                }
        """
        try:
            challenge_info = CHALLENGES[task_eval_request.eval_id]
            async with ApiClient(configuration) as api_client:
                api_instance = AgentApi(api_client)
                task_input = challenge_info.task

                task_request_body = TaskRequestBody(input=task_input)
                task_response = await api_instance.create_agent_task(
                    task_request_body=task_request_body
                )
                task_info = BenchmarkTaskInfo(
                    task_id=task_response.task_id,
                    start_time=datetime.datetime.now(datetime.timezone.utc),
                    challenge_info=challenge_info,
                )
                task_informations[task_info.task_id] = task_info

                if input_artifacts_dir := challenge_info.task_artifacts_dir:
                    await upload_artifacts(
                        api_instance,
                        input_artifacts_dir,
                        task_response.task_id,
                        "artifacts_in",
                    )
                return task_response
        except ApiException as e:
            logger.error(f"Error whilst trying to create a task:\n{e}")
            logger.error(
                "The above error was caused while processing request: "
                f"{task_eval_request}"
            )
            raise HTTPException(500)

    @router.post("/agent/tasks/{task_id}/steps")
    async def proxy(request: Request, task_id: str):
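        """
        Forward a step-execution request unchanged to the agent under test at
        `{configuration.host}/ap/v1/agent/tasks/{task_id}/steps`.
        """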
        timeout = httpx.Timeout(300.0, read=300.0)  # 5 minutes
        async with httpx.AsyncClient(timeout=timeout) as client:
            # Construct the new URL
            new_url = f"{configuration.host}/ap/v1/agent/tasks/{task_id}/steps"

            # Forward the request
            response = await client.post(
                new_url,
                data=await request.body(),
                headers=dict(request.headers),
            )

            # Return the response from the forwarded request
            return Response(content=response.content, status_code=response.status_code)

    @router.post("/agent/tasks/{task_id}/evaluations")
    async def create_evaluation(task_id: str) -> BenchmarkRun:
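        """
        Evaluate the current state of the given task against its challenge and
        return the results as a `BenchmarkRun` report.
        """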
        task_info = task_informations[task_id]
        challenge = get_challenge_from_source_uri(task_info.challenge_info.source_uri)
        try:
            async with ApiClient(configuration) as api_client:
                api_instance = AgentApi(api_client)
                eval_results = await challenge.evaluate_task_state(
                    api_instance, task_id
                )

            eval_info = BenchmarkRun(
                repository_info=RepositoryInfo(),
                run_details=RunDetails(
                    command=f"agbenchmark --test={challenge.info.name}",
                    benchmark_start_time=(
                        task_info.start_time.strftime("%Y-%m-%dT%H:%M:%S+00:00")
                    ),
                    test_name=challenge.info.name,
                ),
                task_info=TaskInfo(
                    data_path=challenge.info.source_uri,
                    is_regression=None,
                    category=[c.value for c in challenge.info.category],
                    task=challenge.info.task,
                    answer=challenge.info.reference_answer or "",
                    description=challenge.info.description or "",
                ),
                metrics=Metrics(
                    success=all(e.passed for e in eval_results),
                    success_percentage=(
                        100 * sum(e.score for e in eval_results) / len(eval_results)
                        if eval_results  # avoid division by 0
                        else 0
                    ),
                    attempted=True,
                ),
                config={},
            )

            logger.debug(f"Returning evaluation data:\n{eval_info.json(indent=4)}")
            return eval_info
        except ApiException as e:
            logger.error(f"Error {e} whilst trying to evaluate task: {task_id}")
            raise HTTPException(500)

    app.include_router(router, prefix="/ap/v1")

    return app
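
# Minimal usage sketch (not part of the original module): serving this app with
# uvicorn. How `AgentBenchmarkConfig` is constructed below is an assumption --
# build or load it however your benchmark setup requires.
#
#     import uvicorn
#     from agbenchmark.config import AgentBenchmarkConfig
#
#     config = AgentBenchmarkConfig()  # assumption: construct/load per your setup
#     uvicorn.run(setup_fastapi_app(config), host="localhost", port=8080)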