diff options
Diffstat (limited to 'benchmark/agbenchmark/challenges/deprecated/d2.1_guided')
7 files changed, 122 insertions, 0 deletions
diff --git a/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_in/__init__.py b/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_in/__init__.py new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_in/__init__.py diff --git a/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_in/sample_code.py b/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_in/sample_code.py new file mode 100644 index 000000000..df8120bfa --- /dev/null +++ b/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_in/sample_code.py @@ -0,0 +1,13 @@ +# mypy: ignore-errors +from typing import List, Optional + + +def two_sum(nums: List, target: int) -> Optional[List[int]]: + seen = {} + for i, num in enumerate(nums): + typo + complement = target - num + if complement in seen: + return [seen[complement], i] + seen[num] = i + return None diff --git a/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_in/test.py b/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_in/test.py new file mode 100644 index 000000000..c273ee793 --- /dev/null +++ b/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_in/test.py @@ -0,0 +1,32 @@ +# mypy: ignore-errors +from typing import List + +from sample_code import two_sum + + +def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: + result = two_sum(nums, target) + print(result) + assert ( + result == expected_result + ), f"AssertionError: Expected the output to be {expected_result}" + + +if __name__ == "__main__": + # test the trivial case with the first two numbers + nums = [2, 7, 11, 15] + target = 9 + expected_result = [0, 1] + test_two_sum(nums, target, expected_result) + + # test for ability to use zero and the same number twice + nums = [2, 7, 0, 15, 12, 0] + target = 0 + expected_result = [2, 5] + test_two_sum(nums, target, expected_result) + + # test for first and last index usage and negative numbers + nums = [-6, 7, 11, 4] + target = -2 + expected_result = [0, 3] + test_two_sum(nums, target, expected_result) diff --git a/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_out/__init__.py b/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_out/__init__.py new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_out/__init__.py diff --git a/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_out/sample_code.py b/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_out/sample_code.py new file mode 100644 index 000000000..de3d8c62c --- /dev/null +++ b/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_out/sample_code.py @@ -0,0 +1,12 @@ +# mypy: ignore-errors +from typing import List, Optional + + +def two_sum(nums: List, target: int) -> Optional[List[int]]: + seen = {} + for i, num in enumerate(nums): + complement = target - num + if complement in seen: + return [seen[complement], i] + seen[num] = i + return None diff --git a/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_out/test.py b/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_out/test.py new file mode 100644 index 000000000..c273ee793 --- /dev/null +++ b/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/artifacts_out/test.py @@ -0,0 +1,32 @@ +# mypy: ignore-errors +from typing import List + +from sample_code import two_sum + + +def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: + result = two_sum(nums, target) + print(result) + assert ( + result == expected_result + ), f"AssertionError: Expected the output to be {expected_result}" + + +if __name__ == "__main__": + # test the trivial case with the first two numbers + nums = [2, 7, 11, 15] + target = 9 + expected_result = [0, 1] + test_two_sum(nums, target, expected_result) + + # test for ability to use zero and the same number twice + nums = [2, 7, 0, 15, 12, 0] + target = 0 + expected_result = [2, 5] + test_two_sum(nums, target, expected_result) + + # test for first and last index usage and negative numbers + nums = [-6, 7, 11, 4] + target = -2 + expected_result = [0, 3] + test_two_sum(nums, target, expected_result) diff --git a/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/data.json b/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/data.json new file mode 100644 index 000000000..0ef6f3dd3 --- /dev/null +++ b/benchmark/agbenchmark/challenges/deprecated/d2.1_guided/data.json @@ -0,0 +1,33 @@ +{ + "category": [ + "code", + "iterate" + ], + "cutoff": 75, + "dependencies": [ + "TestReadFile" + ], + "eval_id": "e3794d05-7617-4b95-9e96-d5f84309f66f", + "ground": { + "answer": "[0, 1] [2, 5] [0, 3]", + "eval": { + "type": "python" + }, + "files": [ + "test.py" + ], + "should_contain": [ + "[0, 1]", + "[2, 5]", + "[0, 3]" + ], + "should_not_contain": [] + }, + "info": { + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "difficulty": "novice", + "side_effects": [] + }, + "name": "DebugSimpleTypoWithGuidance", + "task": "1- Run test.py.\n2- Read sample_code.py.\n3- Modify sample_code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n" +} |