diff options
Diffstat (limited to 'benchmark/agbenchmark/utils/data_types.py')
-rw-r--r-- | benchmark/agbenchmark/utils/data_types.py | 46 |
1 files changed, 46 insertions, 0 deletions
# diff --git a/benchmark/agbenchmark/utils/data_types.py b/benchmark/agbenchmark/utils/data_types.py
# new file mode 100644
# index 000000000..688209682
# --- /dev/null
# +++ b/benchmark/agbenchmark/utils/data_types.py
# @@ -0,0 +1,46 @@
"""Shared enums, lookup tables and result model for the benchmark utilities.

Defines the difficulty scale (as an enum plus two numeric lookup tables),
the set of challenge categories, and the ``EvalResult`` record.
"""
from enum import Enum
from typing import Literal

from pydantic import BaseModel


class DifficultyLevel(Enum):
    """Named difficulty tiers; each member's value is its own name as a string."""

    interface = "interface"
    basic = "basic"
    novice = "novice"
    intermediate = "intermediate"
    advanced = "advanced"
    expert = "expert"
    human = "human"


# map from enum to difficulty level (numeric)
# Ranks run from 1 (interface) to 7 (human), following the declaration order
# of DifficultyLevel.
DIFFICULTY_MAP = {
    DifficultyLevel.interface: 1,
    DifficultyLevel.basic: 2,
    DifficultyLevel.novice: 3,
    DifficultyLevel.intermediate: 4,
    DifficultyLevel.advanced: 5,
    DifficultyLevel.expert: 6,
    DifficultyLevel.human: 7,
}

# Same numeric ranks as DIFFICULTY_MAP, keyed by the member's string value
# (e.g. "basic" -> 2) so callers holding a plain string need not build the enum.
STRING_DIFFICULTY_MAP = {e.value: DIFFICULTY_MAP[e] for e in DifficultyLevel}


class Category(str, Enum):
    """Challenge categories.

    The ``str`` mixin makes every member an actual ``str`` instance, so a
    member compares equal to its raw string value.
    """

    DATA = "data"
    # NOTE: member name and value differ here ("GENERALIST" vs "general").
    GENERALIST = "general"
    CODING = "coding"
    SCRAPE_SYNTHESIZE = "scrape_synthesize"
    WEB = "web"
    GAIA_1 = "GAIA_1"
    GAIA_2 = "GAIA_2"
    GAIA_3 = "GAIA_3"


class EvalResult(BaseModel):
    """Pydantic model holding the result of one evaluation.

    All four fields are required (none declares a default).
    """

    # Result text. NOTE(review): exact content is set by the caller and is not
    # visible from this file.
    result: str
    # Origin of ``result``. "step_output" is the one value named explicitly;
    # the union with ``str`` means any other string is accepted as well.
    result_source: Literal["step_output"] | str
    # Numeric score. NOTE(review): the valid range is not defined here --
    # confirm against the code that constructs EvalResult.
    score: float
    # Pass/fail flag for the evaluation.
    passed: bool