diff options
Diffstat (limited to 'benchmark/reports/match_records.py')
-rw-r--r-- | benchmark/reports/match_records.py | 329 |
1 files changed, 329 insertions, 0 deletions
diff --git a/benchmark/reports/match_records.py b/benchmark/reports/match_records.py new file mode 100644 index 000000000..251fa68c6 --- /dev/null +++ b/benchmark/reports/match_records.py @@ -0,0 +1,329 @@ +import glob +import json +import os +from typing import Dict, List, Optional, Union + +import pandas as pd +from gql import Client, gql +from gql.transport.aiohttp import AIOHTTPTransport +from pydantic import BaseModel, Field + +# from agbenchmark.reports.processing.report_types import Report, SuiteTest + + +class Metrics(BaseModel): + difficulty: str + success: bool + success_percent: float = Field(..., alias="success_%") + run_time: Optional[str] = None + fail_reason: Optional[str] = None + attempted: Optional[bool] = None + + +class MetricsOverall(BaseModel): + run_time: str + highest_difficulty: str + percentage: Optional[float] = None + + +class Test(BaseModel): + data_path: str + is_regression: bool + answer: str + description: str + metrics: Metrics + category: List[str] + task: Optional[str] = None + reached_cutoff: Optional[bool] = None + + +class SuiteTest(BaseModel): + data_path: str + metrics: MetricsOverall + tests: Dict[str, Test] + category: Optional[List[str]] = None + task: Optional[str] = None + reached_cutoff: Optional[bool] = None + + +class Report(BaseModel): + command: str + completion_time: str + benchmark_start_time: str + metrics: MetricsOverall + tests: Dict[str, Union[Test, SuiteTest]] + config: Dict[str, str | dict[str, str]] + + +def get_reports(): + # Initialize an empty list to store the report data + report_data = [] + + # Get the current working directory + current_dir = os.getcwd() + + # Check if the current directory ends with 'reports' + if current_dir.endswith("reports"): + reports_dir = "/" + else: + reports_dir = "reports" + + # Iterate over all agent directories in the reports directory + for agent_name in os.listdir(reports_dir): + if agent_name is None: + continue + agent_dir = os.path.join(reports_dir, agent_name) + + # Check if the item is a directory (an agent directory) + if os.path.isdir(agent_dir): + # Construct the path to the report.json file + # Get all directories and files, but note that this will also include any file, not just directories. + run_dirs = glob.glob(os.path.join(agent_dir, "*")) + + # Get all json files starting with 'file' + # old_report_files = glob.glob(os.path.join(agent_dir, "file*.json")) + + # For each run directory, add the report.json to the end + # Only include the path if it's actually a directory + report_files = [ + os.path.join(run_dir, "report.json") + for run_dir in run_dirs + if os.path.isdir(run_dir) + ] + # old_report_files already contains the full paths, so no need to join again + # report_files = report_files + old_report_files + for report_file in report_files: + # Check if the report.json file exists + if os.path.isfile(report_file): + # Open the report.json file + with open(report_file, "r") as f: + # Load the JSON data from the file + json_data = json.load(f) + print(f"Processing {report_file}") + report = Report.parse_obj(json_data) + + for test_name, test_data in report.tests.items(): + test_json = { + "agent": agent_name.lower(), + "benchmark_start_time": report.benchmark_start_time, + } + + if isinstance(test_data, SuiteTest): + if ( + test_data.category + ): # this means it's a same task test + test_json["challenge"] = test_name + test_json["attempted"] = test_data.tests[ + list(test_data.tests.keys())[0] + ].metrics.attempted + test_json["categories"] = ", ".join( + test_data.category + ) + test_json["task"] = test_data.task + test_json["success"] = test_data.metrics.percentage + test_json[ + "difficulty" + ] = test_data.metrics.highest_difficulty + test_json[ + "success_%" + ] = test_data.metrics.percentage + test_json["run_time"] = test_data.metrics.run_time + test_json["is_regression"] = test_data.tests[ + list(test_data.tests.keys())[0] + ].is_regression + else: # separate tasks in 1 suite + for ( + suite_test_name, + suite_data, + ) in test_data.tests.items(): + test_json["challenge"] = suite_test_name + test_json[ + "attempted" + ] = suite_data.metrics.attempted + test_json["categories"] = ", ".join( + suite_data.category + ) + test_json["task"] = suite_data.task + test_json["success"] = ( + 100.0 if suite_data.metrics.success else 0 + ) + test_json[ + "difficulty" + ] = suite_data.metrics.difficulty + test_json[ + "success_%" + ] = suite_data.metrics.success_percentage + test_json[ + "run_time" + ] = suite_data.metrics.run_time + test_json[ + "is_regression" + ] = suite_data.is_regression + + else: + test_json["challenge"] = test_name + test_json["attempted"] = test_data.metrics.attempted + test_json["categories"] = ", ".join(test_data.category) + test_json["task"] = test_data.task + test_json["success"] = ( + 100.0 if test_data.metrics.success else 0 + ) + test_json["difficulty"] = test_data.metrics.difficulty + test_json[ + "success_%" + ] = test_data.metrics.success_percentage + test_json["run_time"] = test_data.metrics.run_time + test_json["is_regression"] = test_data.is_regression + + report_data.append(test_json) + + return pd.DataFrame(report_data) + + +def get_helicone_data(): + helicone_api_key = os.getenv("HELICONE_API_KEY") + + url = "https://www.helicone.ai/api/graphql" + # Replace <KEY> with your personal access key + transport = AIOHTTPTransport( + url=url, headers={"authorization": f"Bearer {helicone_api_key}"} + ) + + client = Client(transport=transport, fetch_schema_from_transport=True) + + SIZE = 250 + + i = 0 + + data = [] + print("Fetching data from Helicone") + while True: + query = gql( + """ + query ExampleQuery($limit: Int, $offset: Int){ + heliconeRequest( + limit: $limit + offset: $offset + ) { + costUSD + prompt + properties{ + name + value + } + + requestBody + response + createdAt + + } + + } + """ + ) + print(f"Fetching {i * SIZE} to {(i + 1) * SIZE} records") + try: + result = client.execute( + query, variable_values={"limit": SIZE, "offset": i * SIZE} + ) + except Exception as e: + print(f"Error occurred: {e}") + result = None + + i += 1 + + if result: + for item in result["heliconeRequest"]: + properties = { + prop["name"]: prop["value"] for prop in item["properties"] + } + data.append( + { + "createdAt": item["createdAt"], + "agent": properties.get("agent"), + "costUSD": item["costUSD"], + "job_id": properties.get("job_id"), + "challenge": properties.get("challenge"), + "benchmark_start_time": properties.get("benchmark_start_time"), + "prompt": item["prompt"], + "response": item["response"], + "model": item["requestBody"].get("model"), + "request": item["requestBody"].get("messages"), + } + ) + + if not result or (len(result["heliconeRequest"]) == 0): + print("No more results") + break + + df = pd.DataFrame(data) + # Drop rows where agent is None + df = df.dropna(subset=["agent"]) + + # Convert the remaining agent names to lowercase + df["agent"] = df["agent"].str.lower() + + return df + + +if os.path.exists("raw_reports.pkl") and os.path.exists("raw_helicone.pkl"): + reports_df = pd.read_pickle("raw_reports.pkl") + helicone_df = pd.read_pickle("raw_helicone.pkl") +else: + reports_df = get_reports() + reports_df.to_pickle("raw_reports.pkl") + helicone_df = get_helicone_data() + helicone_df.to_pickle("raw_helicone.pkl") + + +def try_formats(date_str): + formats = ["%Y-%m-%d-%H:%M", "%Y-%m-%dT%H:%M:%S%z"] + for fmt in formats: + try: + return pd.to_datetime(date_str, format=fmt) + except ValueError: + pass + return None + + +helicone_df["benchmark_start_time"] = pd.to_datetime( + helicone_df["benchmark_start_time"].apply(try_formats), utc=True +) +helicone_df = helicone_df.dropna(subset=["benchmark_start_time"]) +helicone_df["createdAt"] = pd.to_datetime( + helicone_df["createdAt"], unit="ms", origin="unix" +) +reports_df["benchmark_start_time"] = pd.to_datetime( + reports_df["benchmark_start_time"].apply(try_formats), utc=True +) +reports_df = reports_df.dropna(subset=["benchmark_start_time"]) + +assert pd.api.types.is_datetime64_any_dtype( + helicone_df["benchmark_start_time"] +), "benchmark_start_time in helicone_df is not datetime" +assert pd.api.types.is_datetime64_any_dtype( + reports_df["benchmark_start_time"] +), "benchmark_start_time in reports_df is not datetime" + +reports_df["report_time"] = reports_df["benchmark_start_time"] + +# df = pd.merge_asof( +# helicone_df.sort_values("benchmark_start_time"), +# reports_df.sort_values("benchmark_start_time"), +# left_on="benchmark_start_time", +# right_on="benchmark_start_time", +# by=["agent", "challenge"], +# direction="backward", +# ) + +df = pd.merge( + helicone_df, + reports_df, + on=["benchmark_start_time", "agent", "challenge"], + how="inner", +) + +df.to_pickle("df.pkl") +print(df.info()) +print("Data saved to df.pkl") +print("To load the data use: df = pd.read_pickle('df.pkl')") |