aboutsummaryrefslogtreecommitdiff
path: root/benchmark/reports/mini-agi/12.4_TestDebugSimpleTypoWithGuidance.json
diff options
context:
space:
mode:
Diffstat (limited to 'benchmark/reports/mini-agi/12.4_TestDebugSimpleTypoWithGuidance.json')
-rw-r--r--benchmark/reports/mini-agi/12.4_TestDebugSimpleTypoWithGuidance.json30
1 files changed, 30 insertions, 0 deletions
diff --git a/benchmark/reports/mini-agi/12.4_TestDebugSimpleTypoWithGuidance.json b/benchmark/reports/mini-agi/12.4_TestDebugSimpleTypoWithGuidance.json
new file mode 100644
index 000000000..67efb7944
--- /dev/null
+++ b/benchmark/reports/mini-agi/12.4_TestDebugSimpleTypoWithGuidance.json
@@ -0,0 +1,30 @@
+{
+ "command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance",
+ "completion_time": "2023-07-18-08:36",
+ "metrics": {
+ "run_time": "63.23 seconds",
+ "highest_difficulty": "novice: 3"
+ },
+ "tests": {
+ "TestDebugSimpleTypoWithGuidance": {
+ "data_path": "agbenchmark/challenges/code/d2_debug",
+ "is_regression": true,
+ "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
+ "answer": "[0, 1] [2, 5] [0, 3]",
+ "description": "Tests ability for the agent to debug python code with a simple typo in it.",
+ "metrics": {
+ "difficulty": "novice",
+ "success": true,
+ "success_%": 100.0,
+ "run_time": "62.866 seconds"
+ },
+ "reached_cutoff": false
+ }
+ },
+ "config": {
+ "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+ },
+ "additional": {
+ "model": "gpt-4"
+ }
+}