From 8b0579a87c08305e01e0fae9a3a2f79c45870a3d Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Mon, 29 Jan 2024 11:33:42 +0100 Subject: feat(benchmark): Add `-P`, `--parallel-tasks` option to allow running multiple tasks concurrently * Add dependency `pytest-parallel` and indirect dependency `py` (pylib) * Make `SingletonReportManager` thread safe --- benchmark/agbenchmark/__main__.py | 6 +++ benchmark/agbenchmark/main.py | 4 ++ benchmark/agbenchmark/reports/ReportManager.py | 57 +++++++++++++++----------- benchmark/poetry.lock | 39 +++++++++++++++++- benchmark/pyproject.toml | 2 + 5 files changed, 82 insertions(+), 26 deletions(-) diff --git a/benchmark/agbenchmark/__main__.py b/benchmark/agbenchmark/__main__.py index 9fff53523..b531b4518 100644 --- a/benchmark/agbenchmark/__main__.py +++ b/benchmark/agbenchmark/__main__.py @@ -62,6 +62,9 @@ def start(): @click.option( "-N", "--attempts", default=1, help="Number of times to run each challenge." ) +@click.option( + "-P", "--parallel-tasks", default=1, help="Number of challenges to run in parallel." +) @click.option( "-c", "--category", @@ -111,6 +114,7 @@ def run( category: tuple[str], skip_category: tuple[str], attempts: int, + parallel_tasks: int, cutoff: Optional[int] = None, backend: Optional[bool] = False, # agent_path: Optional[Path] = None, @@ -158,6 +162,7 @@ def run( categories=category, skip_categories=skip_category, attempts_per_challenge=attempts, + concurrent_tasks=parallel_tasks, cutoff=cutoff, ) @@ -177,6 +182,7 @@ def run( categories=category, skip_categories=skip_category, attempts_per_challenge=attempts, + concurrent_tasks=parallel_tasks, cutoff=cutoff, ) diff --git a/benchmark/agbenchmark/main.py b/benchmark/agbenchmark/main.py index 4cd97bd89..4128a0a26 100644 --- a/benchmark/agbenchmark/main.py +++ b/benchmark/agbenchmark/main.py @@ -22,6 +22,7 @@ def run_benchmark( categories: tuple[str] = tuple(), skip_categories: tuple[str] = tuple(), attempts_per_challenge: int = 1, + concurrent_tasks: int = 1, mock: bool = False, no_dep: bool = False, no_cutoff: bool = False, @@ -100,6 +101,9 @@ def run_benchmark( if attempts_per_challenge > 1: pytest_args.append(f"--attempts={attempts_per_challenge}") + if concurrent_tasks > 1: + pytest_args.append(f"--tests-per-worker={concurrent_tasks}") + if cutoff: pytest_args.append(f"--cutoff={cutoff}") logger.debug(f"Setting cuttoff override to {cutoff} seconds.") diff --git a/benchmark/agbenchmark/reports/ReportManager.py b/benchmark/agbenchmark/reports/ReportManager.py index d04beee43..5d1392bbc 100644 --- a/benchmark/agbenchmark/reports/ReportManager.py +++ b/benchmark/agbenchmark/reports/ReportManager.py @@ -3,10 +3,11 @@ import json import logging import os import sys +import threading import time from datetime import datetime, timezone from pathlib import Path -from typing import Any +from typing import Any, ClassVar from agbenchmark.config import AgentBenchmarkConfig from agbenchmark.reports.processing.graphs import save_single_radar_chart @@ -20,39 +21,39 @@ logger = logging.getLogger(__name__) class SingletonReportManager: - instance = None + _instance = None + _lock: ClassVar[threading.Lock] = threading.Lock() INFO_MANAGER: "SessionReportManager" REGRESSION_MANAGER: "RegressionTestsTracker" SUCCESS_RATE_TRACKER: "SuccessRatesTracker" def __new__(cls): - if not cls.instance: - cls.instance = super(SingletonReportManager, cls).__new__(cls) - - agent_benchmark_config = AgentBenchmarkConfig.load() - benchmark_start_time_dt = datetime.now( - timezone.utc - ) # or any logic to 
fetch the datetime - - # Make the Managers class attributes - cls.INFO_MANAGER = SessionReportManager( - agent_benchmark_config.get_report_dir(benchmark_start_time_dt) - / "report.json", - benchmark_start_time_dt, - ) - cls.REGRESSION_MANAGER = RegressionTestsTracker( - agent_benchmark_config.regression_tests_file - ) - cls.SUCCESS_RATE_TRACKER = SuccessRatesTracker( - agent_benchmark_config.success_rate_file - ) - - return cls.instance + with cls._lock: + if not cls._instance: + cls._instance = super(SingletonReportManager, cls).__new__(cls) + + agent_benchmark_config = AgentBenchmarkConfig.load() + benchmark_start_time_dt = datetime.now(timezone.utc) + + # Make the Managers class attributes + cls.INFO_MANAGER = SessionReportManager( + agent_benchmark_config.get_report_dir(benchmark_start_time_dt) + / "report.json", + benchmark_start_time_dt, + ) + cls.REGRESSION_MANAGER = RegressionTestsTracker( + agent_benchmark_config.regression_tests_file + ) + cls.SUCCESS_RATE_TRACKER = SuccessRatesTracker( + agent_benchmark_config.success_rate_file + ) + + return cls._instance @classmethod def clear_instance(cls): - cls.instance = None + cls._instance = None cls.INFO_MANAGER = None cls.REGRESSION_MANAGER = None cls.SUCCESS_RATE_TRACKER = None @@ -131,6 +132,12 @@ class SessionReportManager(BaseReportManager): self.save() + def get_test_report(self, test_name: str) -> Test | None: + if isinstance(self.tests, Report): + return self.tests.tests.get(test_name) + else: + return self.tests.get(test_name) + def finalize_session_report(self, config: AgentBenchmarkConfig) -> None: command = " ".join(sys.argv) diff --git a/benchmark/poetry.lock b/benchmark/poetry.lock index 057b89aa4..5bdfbe089 100644 --- a/benchmark/poetry.lock +++ b/benchmark/poetry.lock @@ -1946,6 +1946,17 @@ files = [ [package.extras] tests = ["pytest"] +[[package]] +name = "py" +version = "1.11.0" +description = "library with cross-python path, ini-parsing, io, code, log facilities" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +files = [ + {file = "py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"}, + {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, +] + [[package]] name = "pyasn1" version = "0.5.1" @@ -2137,6 +2148,21 @@ pytest = ">=7.0.0" docs = ["sphinx (>=5.3)", "sphinx-rtd-theme (>=1.0)"] testing = ["coverage (>=6.2)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "mypy (>=0.931)", "pytest-trio (>=0.7.0)"] +[[package]] +name = "pytest-parallel" +version = "0.1.1" +description = "a pytest plugin for parallel and concurrent testing" +optional = false +python-versions = "*" +files = [ + {file = "pytest-parallel-0.1.1.tar.gz", hash = "sha256:9aac3fc199a168c0a8559b60249d9eb254de7af58c12cee0310b54d4affdbfab"}, + {file = "pytest_parallel-0.1.1-py3-none-any.whl", hash = "sha256:9e3703015b0eda52be9e07d2ba3498f09340a56d5c79a39b50f22fc5c38212fe"}, +] + +[package.dependencies] +pytest = ">=3.0.0" +tblib = "*" + [[package]] name = "python-dateutil" version = "2.8.2" @@ -2431,6 +2457,17 @@ anyio = ">=3.4.0,<5" [package.extras] full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart", "pyyaml"] +[[package]] +name = "tblib" +version = "3.0.0" +description = "Traceback serialization library." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "tblib-3.0.0-py3-none-any.whl", hash = "sha256:80a6c77e59b55e83911e1e607c649836a69c103963c5f28a46cbeef44acf8129"}, + {file = "tblib-3.0.0.tar.gz", hash = "sha256:93622790a0a29e04f0346458face1e144dc4d32f493714c6c3dff82a4adb77e6"}, +] + [[package]] name = "toml" version = "0.10.2" @@ -2760,4 +2797,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "e0d1f991958a5d630287c7bb668e7fdc6183630e06196cf6f507a086be10baec" +content-hash = "4a4e53f252c8996b172bbb35a730197c07c53d7b50bf1d21964d3b2237495066" diff --git a/benchmark/pyproject.toml b/benchmark/pyproject.toml index 6740004b4..b8bf8ccab 100644 --- a/benchmark/pyproject.toml +++ b/benchmark/pyproject.toml @@ -25,7 +25,9 @@ networkx = "^3.1" colorama = "^0.4.6" pyvis = "^0.3.2" selenium = "^4.11.2" +py = "^1.11.0" # needed for pytest-parallel pytest-asyncio = "^0.21.1" +pytest-parallel = "^0.1.1" uvicorn = "^0.23.2" fastapi = "^0.99.0" python-multipart = "^0.0.6" -- cgit v1.2.3
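For illustration, assuming the package's console script is named `agbenchmark` (the entry point is not shown in this patch) and the click command above is exposed as its `run` subcommand, the new option would combine with the existing ones roughly as follows. With `-P 4`, `run_benchmark()` appends `--tests-per-worker=4` to the pytest arguments, so the pytest-parallel plugin runs up to four challenges concurrently in threads of a single worker process:

    agbenchmark run -c coding -N 3 -P 4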
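The `SingletonReportManager` change guards lazy initialisation with a class-level lock because pytest-parallel executes tests in multiple threads of one process. A minimal, self-contained sketch of the same lock-guarded singleton pattern (illustrative names only, not taken from the codebase):

    import threading


    class LazySingleton:
        """Lazily initialised singleton that is safe to construct from many threads."""

        _instance = None
        _lock = threading.Lock()

        def __new__(cls):
            # Without the lock, two pytest-parallel worker threads could both
            # observe _instance as None and run the expensive set-up twice
            # (stand-in here for loading the benchmark config and creating
            # report files), clobbering each other's state.
            with cls._lock:
                if cls._instance is None:
                    cls._instance = super().__new__(cls)
                    cls._instance._init_once()
            return cls._instance

        def _init_once(self) -> None:
            # Runs exactly once per process, regardless of how many threads
            # race to construct the singleton.
            self.created_by = threading.get_ident()


    if __name__ == "__main__":
        results: list[LazySingleton] = []
        threads = [
            threading.Thread(target=lambda: results.append(LazySingleton()))
            for _ in range(8)
        ]
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        # Every thread received the same instance.
        assert all(r is results[0] for r in results)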