aboutsummaryrefslogtreecommitdiff
path: root/benchmark/agbenchmark/utils/data_types.py
diff options
context:
space:
mode:
Diffstat (limited to 'benchmark/agbenchmark/utils/data_types.py')
-rw-r--r--benchmark/agbenchmark/utils/data_types.py46
1 files changed, 46 insertions, 0 deletions
diff --git a/benchmark/agbenchmark/utils/data_types.py b/benchmark/agbenchmark/utils/data_types.py
new file mode 100644
index 000000000..688209682
--- /dev/null
+++ b/benchmark/agbenchmark/utils/data_types.py
@@ -0,0 +1,46 @@
+from enum import Enum
+from typing import Literal
+
+from pydantic import BaseModel
+
+
+class DifficultyLevel(Enum):  # difficulty tiers; each value repeats the member name
+ interface = "interface"  # mapped to rank 1 in DIFFICULTY_MAP
+ basic = "basic"
+ novice = "novice"
+ intermediate = "intermediate"
+ advanced = "advanced"
+ expert = "expert"
+ human = "human"  # mapped to rank 7 in DIFFICULTY_MAP
+
+
+# Numeric rank for every DifficultyLevel member: 1 (interface) up to 7 (human).
+DIFFICULTY_MAP = {
+ DifficultyLevel.interface: 1,
+ DifficultyLevel.basic: 2,
+ DifficultyLevel.novice: 3,
+ DifficultyLevel.intermediate: 4,
+ DifficultyLevel.advanced: 5,
+ DifficultyLevel.expert: 6,
+ DifficultyLevel.human: 7,
+}
+# Same ranks, keyed by each member's string value instead (e.g. "basic" -> 2).
+STRING_DIFFICULTY_MAP = {e.value: DIFFICULTY_MAP[e] for e in DifficultyLevel}
+
+
+class Category(str, Enum):  # str mixin: every member is also a str instance
+ DATA = "data"
+ GENERALIST = "general"  # value is "general", not the lowercased member name
+ CODING = "coding"
+ SCRAPE_SYNTHESIZE = "scrape_synthesize"
+ WEB = "web"
+ GAIA_1 = "GAIA_1"  # GAIA_* values keep the member name's upper case
+ GAIA_2 = "GAIA_2"
+ GAIA_3 = "GAIA_3"
+
+
+class EvalResult(BaseModel):  # pydantic model; none of the fields declares a default
+ result: str
+ result_source: Literal["step_output"] | str  # "step_output" is the one named source; any str fits
+ score: float
+ passed: bool