From 0b899eb4cfce5084abec3c25342ddf2e097dc1ac Mon Sep 17 00:00:00 2001 From: Toran Bruce Richards Date: Thu, 6 Apr 2023 13:59:45 +0100 Subject: Initial commit --- .gitignore | 129 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ LICENSE | 21 ++++++++++ README.md | 2 + 3 files changed, 152 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..b6e47617d --- /dev/null +++ b/.gitignore @@ -0,0 +1,129 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..601935b85 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Toran Bruce Richards + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 000000000..0120d4fca --- /dev/null +++ b/README.md @@ -0,0 +1,2 @@ +# Auto-GPT-Benchmarks +A set of standardised benchmarks to assess the performance of Auto-GPTs. -- cgit v1.2.3 From 89081d942c077190d9aa89b0b88cbcc03162da2c Mon Sep 17 00:00:00 2001 From: douglas Date: Mon, 17 Apr 2023 17:22:31 -0400 Subject: First commit for AutoGPT Benchmarks --- .gitmodules | 3 + README.md | 38 ++++++++++ auto_gpt_benchmarking/Auto-GPT | 1 + auto_gpt_benchmarking/AutoGPTAgent.py | 88 ++++++++++++++++++++++ auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml | 8 ++ auto_gpt_benchmarking/CompletionFn.py | 27 +++++++ auto_gpt_benchmarking/LangChainCompletions.py | 34 +++++++++ auto_gpt_benchmarking/__init__.py | 0 .../completion_fns/auto_gpt_completion_fn.yaml | 2 + auto_gpt_benchmarking/main.py | 4 + requirements.txt | 1 + 11 files changed, 206 insertions(+) create mode 100644 .gitmodules create mode 160000 auto_gpt_benchmarking/Auto-GPT create mode 100644 auto_gpt_benchmarking/AutoGPTAgent.py create mode 100644 auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml create mode 100644 auto_gpt_benchmarking/CompletionFn.py create mode 100644 auto_gpt_benchmarking/LangChainCompletions.py create mode 100644 auto_gpt_benchmarking/__init__.py create mode 100644 auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml create mode 100644 auto_gpt_benchmarking/main.py create mode 100644 requirements.txt diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..d293ba9c4 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "Auto-GPT"] + path = auto_gpt_benchmarking/Auto-GPT + url = https://github.com/Significant-Gravitas/Auto-GPT.git diff --git a/README.md b/README.md index 0120d4fca..75db145a2 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,40 @@ # Auto-GPT-Benchmarks A set of standardised benchmarks to assess the performance of Auto-GPTs. + +# What is next? + +- [ ] Build longer form tasks, (code fix backed by testing) +- [ ] Explicitly note the common failure modes in the test harness and fix them. Most of these appear to be failure modes with the core AutoGPT project +- [ ] Switch to a ubuntu container so it can do more things (git, bash, etc) +- [ ] Lower priority, but put this in a webserver backend so we have a good API +- [ ] Get token counting data from the model Add scores to result files based on pricing associated with tokens and models used +- [ ] Think about how this can be applied to other projects besides AutoGPT so we can be THE agent evaluation framework. + + +## Understanding OpenAI Evals + +The Evals docs are here and very good: https://github.com/openai/evals/tree/main/docs + +The basic idea is this though: +1. Use a completion function to point to the language model or in our case AutoGPT, the model you want to test. +2. Register that completion function with the evals framework with a yaml in a `completion_fns` dir. +3. Run the evals against the completion function. + +Then you can make more also, yaml defined evals and run them against the completion function as needed. + +### Completions Functions + +See our yaml file in `completion_fns` dir for the registration of the completion function. 
+See our completion function itself in CompletionFn.py +That points to the AutoGPT model we want to test which is spun up dynamically in a docker container in AutoGPTAgent.py + + +# RANDOM SHIT + +You must add the auto_gpt_bencchmarking dir to the python path +Do this with a path file in your venv. OpenAI evals needs to import it. + +I added a file to `venv/lib/python3.9/site-packages/benchmarking.pth` with the contents: +`/home/douglas/AGI/Auto-GPT-Benchmarks-fork` + + diff --git a/auto_gpt_benchmarking/Auto-GPT b/auto_gpt_benchmarking/Auto-GPT new file mode 160000 index 000000000..97d62cc16 --- /dev/null +++ b/auto_gpt_benchmarking/Auto-GPT @@ -0,0 +1 @@ +Subproject commit 97d62cc16bf45fcd406efeb33d042ebd58c24670 diff --git a/auto_gpt_benchmarking/AutoGPTAgent.py b/auto_gpt_benchmarking/AutoGPTAgent.py new file mode 100644 index 000000000..f24b150b4 --- /dev/null +++ b/auto_gpt_benchmarking/AutoGPTAgent.py @@ -0,0 +1,88 @@ +""" +This instantiates an AutoGPT agent who is capable of handling any task. +It is designed to pass benchmarks as effectively as possible. + +Loads in the ai_settings.yaml file to get the AI's name, role, and goals. +Sets the ai to continuous mode, but kills it if it takes more than 50,000 tokens on any particular evaluation. + +The model is instantiated with a prompt from the AutoGPT completion function. + +Eventualy we will also save and log all of the associated output and thinking for the model as well +""" +from pathlib import Path +import os + + +class AutoGPTAgent: + """ + A class object that contains the configuration information for the AI + The init function takes an evaluation prompt. + It copies the ai_settings.yaml file in AutoGPTData to the Auto-GPT repo. + It then copies the given prompt to a text file to Auto-GPT/auto_gpt_workspace called prompt.txt + It then polls the token usage of the model and for a file called output.txt in the Auto-GPT/auto_gpt_workspace folder. + If the model has used more than 50,000 tokens, it kills the model. + If the model has used less than 50,000 tokens, it returns the output.txt file. + """ + def _clean_up_workspace(self): + """ + Cleans up the workspace by deleting the prompt.txt and output.txt files. + :return: + """ + # check if the files are there and delete them if they are + if self.prompt_file.exists(): + self.prompt_file.unlink() + if self.output_file.exists(): + self.output_file.unlink() + + def _copy_ai_settings(self): + self.ai_settings_dest.write_text(self.ai_settings_file.read_text()) + + def _copy_prompt(self): + self.prompt_file.write_text(self.prompt) + + def _start_agent(self): + """ + This starts the agent in the docker container. + This assumes you have the docker image built with: + docker build -t autogpt . + In the dockerfile in the Auto-GPT repo. + You also must set up the .env file in the Auto-GPT repo. + :return: + """ + env_file = self.auto_gpt_path / ".env" + # run it in continuous mode and skip re-prompts + os.system(f"docker run -it --env-file={env_file} -v {self.auto_workspace}:/home/appuser/auto_gpt_workspace -v {self.auto_gpt_path}/autogpt:/home/appuser/autogpt autogpt --continuous -C '/home/appuser/auto_gpt_workspace/ai_settings.yaml'") + + def _poll_for_output(self): + """ + This polls the output file to see if the model has finished. 
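+        Note: this is a busy-wait with no timeout of its own; the loop blocks
+        until output.txt appears. The token-based kill described in the class
+        docstring is aspirational at this stage (see the token-counting item
+        in the README's to-do list).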
+ :return: + """ + while True: + if self.output_file.exists(): + return self.output_file.read_text() + + def __init__(self, prompt): + self.auto_gpt_path = Path(__file__).parent / "Auto-GPT" + self.auto_workspace = self.auto_gpt_path / "auto_gpt_workspace" + self.prompt_file = self.auto_workspace / "prompt.txt" + self.output_file = self.auto_workspace / "output.txt" + self.ai_settings_file = Path(__file__).parent / "AutoGPTData" / "ai_settings.yaml" + self.ai_settings_dest = self.auto_workspace / "ai_settings.yaml" + self.prompt = prompt + self._clean_up_workspace() + self._copy_ai_settings() + self._copy_prompt() + + def start(self): + self._start_agent() + answer = self._poll_for_output() + print('about to do clean up') + print(answer) + self._clean_up_workspace() + print('did clean up') + return answer + + + + diff --git a/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml b/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml new file mode 100644 index 000000000..b7cc573d5 --- /dev/null +++ b/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml @@ -0,0 +1,8 @@ +ai_goals: +- Evaluate the prompt in `prompt.txt` +- Use all of the tools at your disposal to evaluate the question and find the best answer in the format provided. +- Get the correct answer to the question in the fewest number of steps possible. You are scored first on if you get the correct answer, and second on how many tokens you take to get the right answer so keep your thinking and tool usage as minimal as possible while still ensuring you get the correct answer. +- Save your work in the `output.txt` file, the second you do this, exit the program. +- Exit the program when you are done. +ai_name: EvaluationAgent +ai_role: an ai that is tested on how effectively it can efficiently evaluate questions and answer them correctly while using as few resources as possible diff --git a/auto_gpt_benchmarking/CompletionFn.py b/auto_gpt_benchmarking/CompletionFn.py new file mode 100644 index 000000000..9bb4bb32b --- /dev/null +++ b/auto_gpt_benchmarking/CompletionFn.py @@ -0,0 +1,27 @@ +import importlib +from typing import Optional +from evals.api import CompletionFn, CompletionResult + +from evals.prompt.base import CompletionPrompt +from evals.record import record_sampling +from auto_gpt_benchmarking.AutoGPTAgent import AutoGPTAgent + + +class AutoGPTCompletionResult(CompletionResult): + def __init__(self, response) -> None: + self.response = response + + def get_completions(self) -> list[str]: + return [self.response.strip()] + + +class AutoGPTCompletionFn(CompletionFn): + def __init__(self, **kwargs) -> None: + pass + + def __call__(self, prompt, **kwargs) -> AutoGPTCompletionResult: + prompt = CompletionPrompt(prompt).to_formatted_prompt() + agent = AutoGPTAgent(prompt) + response = agent.start() + record_sampling(prompt=prompt, sampled=response) + return AutoGPTCompletionResult(response) \ No newline at end of file diff --git a/auto_gpt_benchmarking/LangChainCompletions.py b/auto_gpt_benchmarking/LangChainCompletions.py new file mode 100644 index 000000000..17f52bfa1 --- /dev/null +++ b/auto_gpt_benchmarking/LangChainCompletions.py @@ -0,0 +1,34 @@ +import importlib +from typing import Optional +from evals.api import CompletionFn, CompletionResult + +from langchain.llms import BaseLLM + +from evals.prompt.base import CompletionPrompt +from evals.record import record_sampling + + +class LangChainLLMCompletionResult(CompletionResult): + def __init__(self, response) -> None: + self.response = response + + def get_completions(self) -> 
list[str]: + return [self.response.strip()] + + +class LangChainLLMCompletionFn(CompletionFn): + def __init__(self, llm: str, llm_kwargs: Optional[dict] = {}, **kwargs) -> None: + # Import and resolve self.llm to an instance of llm argument here, assuming it's always a subclass of BaseLLM + module = importlib.import_module("langchain.llms") + LLMClass = getattr(module, llm) + + if issubclass(LLMClass, BaseLLM): + self.llm = LLMClass(**llm_kwargs) + else: + raise ValueError(f"{llm} is not a subclass of BaseLLM") + + def __call__(self, prompt, **kwargs) -> LangChainLLMCompletionResult: + prompt = CompletionPrompt(prompt).to_formatted_prompt() + response = self.llm(prompt) + record_sampling(prompt=prompt, sampled=response) + return LangChainLLMCompletionResult(response) diff --git a/auto_gpt_benchmarking/__init__.py b/auto_gpt_benchmarking/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml b/auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml new file mode 100644 index 000000000..d6a55a29b --- /dev/null +++ b/auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml @@ -0,0 +1,2 @@ +auto_gpt_completion_fn: + class: auto_gpt_benchmarking.CompletionFn:AutoGPTCompletionFn \ No newline at end of file diff --git a/auto_gpt_benchmarking/main.py b/auto_gpt_benchmarking/main.py new file mode 100644 index 000000000..f0303f1e7 --- /dev/null +++ b/auto_gpt_benchmarking/main.py @@ -0,0 +1,4 @@ +""" +To run auto-gpt we need to run the following command: + +""" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 000000000..a59bcbdd3 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +evals \ No newline at end of file -- cgit v1.2.3 From 7212c3876d9c23c52893788462ac744e80853329 Mon Sep 17 00:00:00 2001 From: douglas Date: Mon, 17 Apr 2023 17:34:45 -0400 Subject: Cleanup --- README.md | 1 + auto_gpt_benchmarking/main.py | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) delete mode 100644 auto_gpt_benchmarking/main.py diff --git a/README.md b/README.md index 75db145a2..db3c5e3ac 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ A set of standardised benchmarks to assess the performance of Auto-GPTs. - [ ] Lower priority, but put this in a webserver backend so we have a good API - [ ] Get token counting data from the model Add scores to result files based on pricing associated with tokens and models used - [ ] Think about how this can be applied to other projects besides AutoGPT so we can be THE agent evaluation framework. +- [ ] Figure our how the OpenAI Evals results are saved... 
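As a starting point for that last item: the example output later in this README shows evals writing each run's records as JSON Lines under `/tmp/evallogs/`. A minimal sketch for pulling the final accuracy back out of such a file, assuming that format, could look like:

```python
import json
from pathlib import Path
from typing import Optional


def final_accuracy(log_path: Path) -> Optional[float]:
    """Scan an OpenAI Evals run log (JSONL) for its final_report accuracy."""
    for line in log_path.read_text().splitlines():
        if not line.strip():
            continue
        record = json.loads(line)
        if "final_report" in record:
            return record["final_report"].get("accuracy")
    return None  # no final_report line; the run may have been interrupted


# e.g. for the run shown in the example output below:
# final_accuracy(Path("/tmp/evallogs/230417220821DPM75QNS_auto_gpt_completion_fn_test-match.jsonl"))
```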
## Understanding OpenAI Evals diff --git a/auto_gpt_benchmarking/main.py b/auto_gpt_benchmarking/main.py deleted file mode 100644 index f0303f1e7..000000000 --- a/auto_gpt_benchmarking/main.py +++ /dev/null @@ -1,4 +0,0 @@ -""" -To run auto-gpt we need to run the following command: - -""" \ No newline at end of file -- cgit v1.2.3 From 59ff485253225dc7902cc506369ded9457dfed64 Mon Sep 17 00:00:00 2001 From: douglas Date: Mon, 17 Apr 2023 18:14:09 -0400 Subject: Prompt engineering fixes --- README.md | 39 ++++++++++++++++++++-- auto_gpt_benchmarking/AutoGPTAgent.py | 3 ++ auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml | 6 ++-- 3 files changed, 41 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index db3c5e3ac..b8f09a94c 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,8 @@ A set of standardised benchmarks to assess the performance of Auto-GPTs. - [ ] Get token counting data from the model Add scores to result files based on pricing associated with tokens and models used - [ ] Think about how this can be applied to other projects besides AutoGPT so we can be THE agent evaluation framework. - [ ] Figure our how the OpenAI Evals results are saved... +- [ ] Support multi-threaded evals. OpenAI has great support for this. The docker system built here doesn't. +- [ ] Make the file logger/duplicate op checker more robust. It's not great right now. ## Understanding OpenAI Evals @@ -30,12 +32,43 @@ See our completion function itself in CompletionFn.py That points to the AutoGPT model we want to test which is spun up dynamically in a docker container in AutoGPTAgent.py -# RANDOM SHIT +## Setup You must add the auto_gpt_bencchmarking dir to the python path Do this with a path file in your venv. OpenAI evals needs to import it. -I added a file to `venv/lib/python3.9/site-packages/benchmarking.pth` with the contents: -`/home/douglas/AGI/Auto-GPT-Benchmarks-fork` +Create a venv with +`python3.9 -m venv venv` + +Activate it with + +`source venv/bin/activate` + +Add a file to `venv/lib/python3.9/site-packages/benchmarking.pth` with the contents: +`/PATH/TO/REPO/Auto-GPT-Benchmarks-fork` + +This is because evals tries to import it directly. 
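If you would rather not create that file by hand, a throwaway sketch like the following does the same thing. Run it from the repo root with the venv activated; it assumes `site.getsitepackages()[0]` is the venv's site-packages directory:

```python
import site
from pathlib import Path

# Write the .pth file described above so OpenAI evals can import this repo.
repo_root = Path.cwd().resolve()
pth_file = Path(site.getsitepackages()[0]) / "benchmarking.pth"
pth_file.write_text(f"{repo_root}\n")
print(f"wrote {pth_file} -> {repo_root}")
```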
+ +Install the requirements with + +`pip install -r requirements.txt` + +## Running the tests + +EVALS_THREADS=1 EVALS_THREAD_TIMEOUT=600 oaieval auto_gpt_completion_fn test-match --registry_path $PWD/auto_gpt_benchmarking + + +# Example final output: + +~/AGI/Auto-GPT-Benchmarks-fork$ cat /tmp/evallogs/230417220821DPM75QNS_auto_gpt_completion_fn_test-match.jsonl +{"spec": {"completion_fns": ["auto_gpt_completion_fn"], "eval_name": "test-match.s1.simple-v0", "base_eval": "test-match", "split": "s1", "run_config": {"completion_fns": ["auto_gpt_completion_fn"], "eval_spec": {"cls": "evals.elsuite.basic.match:Match", "args": {"samples_jsonl": "test_match/samples.jsonl"}, "key": "test-match.s1.simple-v0", "group": "test-basic"}, "seed": 20220722, "max_samples": null, "command": "/home/douglas/AGI/Auto-GPT-Benchmarks-fork/venv/bin/oaieval auto_gpt_completion_fn test-match --registry_path /home/douglas/AGI/Auto-GPT-Benchmarks-fork/auto_gpt_benchmarking", "initial_settings": {"visible": true}}, "created_by": "", "run_id": "230417220821DPM75QNS", "created_at": "2023-04-17 22:08:21.904498"}} +{"final_report": {"accuracy": 0.3333333333333333}} +{"run_id": "230417220821DPM75QNS", "event_id": 0, "sample_id": "test-match.s1.2", "type": "sampling", "data": {"prompt": "Complete the phrase as concisely as possible.\nUser: OpenAI was founded in 20\nAssistant: ", "sampled": "OpenAI was founded in 2015.2015"}, "created_by": "", "created_at": "2023-04-17 22:10:13.127375+00:00"} +{"run_id": "230417220821DPM75QNS", "event_id": 1, "sample_id": "test-match.s1.2", "type": "match", "data": {"correct": false, "expected": "15", "picked": null, "sampled": "OpenAI was founded in 2015.2015", "options": ["15"]}, "created_by": "", "created_at": "2023-04-17 22:10:13.127550+00:00"} +{"run_id": "230417220821DPM75QNS", "event_id": 2, "sample_id": "test-match.s1.1", "type": "sampling", "data": {"prompt": "Complete the phrase as concisely as possible.\nUser: The first US president was \nAssistant: ", "sampled": "George Washington"}, "created_by": "", "created_at": "2023-04-17 22:11:17.761693+00:00"} +{"run_id": "230417220821DPM75QNS", "event_id": 3, "sample_id": "test-match.s1.1", "type": "match", "data": {"correct": true, "expected": "George Washington", "picked": "George Washington", "sampled": "George Washington", "options": ["George Washington"]}, "created_by": "", "created_at": "2023-04-17 22:11:17.761739+00:00"} +{"run_id": "230417220821DPM75QNS", "event_id": 4, "sample_id": "test-match.s1.0", "type": "sampling", "data": {"prompt": "Complete the phrase as concisely as possible.\nUser: Once upon a \nAssistant: ", "sampled": "Once upon a time"}, "created_by": "", "created_at": "2023-04-17 22:12:04.691026+00:00"} +{"run_id": "230417220821DPM75QNS", "event_id": 5, "sample_id": "test-match.s1.0", "type": "match", "data": {"correct": false, "expected": "time", "picked": null, "sampled": "Once upon a time", "options": ["time"]}, "created_by": "", "created_at": "2023-04-17 22:12:04.691064+00:00"} +(venv) douglas@douglas-XPS-15-9500:~/AGI/Auto-GPT-Benchmarks-fork$ diff --git a/auto_gpt_benchmarking/AutoGPTAgent.py b/auto_gpt_benchmarking/AutoGPTAgent.py index f24b150b4..097311c73 100644 --- a/auto_gpt_benchmarking/AutoGPTAgent.py +++ b/auto_gpt_benchmarking/AutoGPTAgent.py @@ -33,6 +33,8 @@ class AutoGPTAgent: self.prompt_file.unlink() if self.output_file.exists(): self.output_file.unlink() + if self.file_logger.exists(): + self.file_logger.unlink() def _copy_ai_settings(self): 
self.ai_settings_dest.write_text(self.ai_settings_file.read_text()) @@ -67,6 +69,7 @@ class AutoGPTAgent: self.auto_workspace = self.auto_gpt_path / "auto_gpt_workspace" self.prompt_file = self.auto_workspace / "prompt.txt" self.output_file = self.auto_workspace / "output.txt" + self.file_logger = self.auto_workspace / "file_logger.txt" self.ai_settings_file = Path(__file__).parent / "AutoGPTData" / "ai_settings.yaml" self.ai_settings_dest = self.auto_workspace / "ai_settings.yaml" self.prompt = prompt diff --git a/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml b/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml index b7cc573d5..ec995a666 100644 --- a/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml +++ b/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml @@ -1,8 +1,6 @@ ai_goals: -- Evaluate the prompt in `prompt.txt` -- Use all of the tools at your disposal to evaluate the question and find the best answer in the format provided. +- Evaluate the prompt in `prompt.txt` and find the best answer in the format provided. - Get the correct answer to the question in the fewest number of steps possible. You are scored first on if you get the correct answer, and second on how many tokens you take to get the right answer so keep your thinking and tool usage as minimal as possible while still ensuring you get the correct answer. -- Save your work in the `output.txt` file, the second you do this, exit the program. -- Exit the program when you are done. +- Save the final answer and output to the `output.txt` file, the only file you should write to then immediately exit the program. ai_name: EvaluationAgent ai_role: an ai that is tested on how effectively it can efficiently evaluate questions and answer them correctly while using as few resources as possible -- cgit v1.2.3 From 3b0091c2314f61e71246c1609bb1fb0607c85b58 Mon Sep 17 00:00:00 2001 From: Ambuj Pawar Date: Tue, 18 Apr 2023 09:25:25 +0200 Subject: Typo in README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b8f09a94c..871f17b76 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ That points to the AutoGPT model we want to test which is spun up dynamically in ## Setup -You must add the auto_gpt_bencchmarking dir to the python path +You must add the auto_gpt_benchmarking dir to the python path Do this with a path file in your venv. OpenAI evals needs to import it. Create a venv with -- cgit v1.2.3 From 2fbb03dc6c1df3ca1fae2549c3aa9c0a1d86aea6 Mon Sep 17 00:00:00 2001 From: Douglas Schonholtz <15002691+dschonholtz@users.noreply.github.com> Date: Tue, 18 Apr 2023 10:27:47 -0400 Subject: Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 871f17b76..123c87e8b 100644 --- a/README.md +++ b/README.md @@ -18,12 +18,12 @@ A set of standardised benchmarks to assess the performance of Auto-GPTs. The Evals docs are here and very good: https://github.com/openai/evals/tree/main/docs -The basic idea is this though: +The basic idea is this: 1. Use a completion function to point to the language model or in our case AutoGPT, the model you want to test. 2. Register that completion function with the evals framework with a yaml in a `completion_fns` dir. 3. Run the evals against the completion function. -Then you can make more also, yaml defined evals and run them against the completion function as needed. +Then you can make more yaml defined evals and run them against the completion function as needed. 
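To make those three steps concrete, a skeleton completion function following the same pattern as this repo's `CompletionFn.py` might look like the sketch below (the `Echo*` names are illustrative, not part of the repo):

```python
from evals.api import CompletionFn, CompletionResult
from evals.prompt.base import CompletionPrompt
from evals.record import record_sampling


class EchoCompletionResult(CompletionResult):
    def __init__(self, response: str) -> None:
        self.response = response

    def get_completions(self) -> list[str]:
        return [self.response.strip()]


class EchoCompletionFn(CompletionFn):
    """Toy completion function that just parrots the formatted prompt back."""

    def __call__(self, prompt, **kwargs) -> EchoCompletionResult:
        text = CompletionPrompt(prompt).to_formatted_prompt()
        record_sampling(prompt=text, sampled=text)
        return EchoCompletionResult(text)
```

Register it with a two-line yaml in the `completion_fns` dir (as done for `auto_gpt_completion_fn.yaml`) and oaieval can then run any eval against it.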
### Completions Functions @@ -61,7 +61,7 @@ EVALS_THREADS=1 EVALS_THREAD_TIMEOUT=600 oaieval auto_gpt_completion_fn test-mat # Example final output: -~/AGI/Auto-GPT-Benchmarks-fork$ cat /tmp/evallogs/230417220821DPM75QNS_auto_gpt_completion_fn_test-match.jsonl +/Auto-GPT-Benchmarks-fork$ cat /tmp/evallogs/230417220821DPM75QNS_auto_gpt_completion_fn_test-match.jsonl {"spec": {"completion_fns": ["auto_gpt_completion_fn"], "eval_name": "test-match.s1.simple-v0", "base_eval": "test-match", "split": "s1", "run_config": {"completion_fns": ["auto_gpt_completion_fn"], "eval_spec": {"cls": "evals.elsuite.basic.match:Match", "args": {"samples_jsonl": "test_match/samples.jsonl"}, "key": "test-match.s1.simple-v0", "group": "test-basic"}, "seed": 20220722, "max_samples": null, "command": "/home/douglas/AGI/Auto-GPT-Benchmarks-fork/venv/bin/oaieval auto_gpt_completion_fn test-match --registry_path /home/douglas/AGI/Auto-GPT-Benchmarks-fork/auto_gpt_benchmarking", "initial_settings": {"visible": true}}, "created_by": "", "run_id": "230417220821DPM75QNS", "created_at": "2023-04-17 22:08:21.904498"}} {"final_report": {"accuracy": 0.3333333333333333}} {"run_id": "230417220821DPM75QNS", "event_id": 0, "sample_id": "test-match.s1.2", "type": "sampling", "data": {"prompt": "Complete the phrase as concisely as possible.\nUser: OpenAI was founded in 20\nAssistant: ", "sampled": "OpenAI was founded in 2015.2015"}, "created_by": "", "created_at": "2023-04-17 22:10:13.127375+00:00"} -- cgit v1.2.3 From dad4804b4e53f4aab4f2615345d4638719399da1 Mon Sep 17 00:00:00 2001 From: Douglas Schonholtz <15002691+dschonholtz@users.noreply.github.com> Date: Tue, 18 Apr 2023 10:29:05 -0400 Subject: Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 123c87e8b..f3b54648b 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ A set of standardised benchmarks to assess the performance of Auto-GPTs. - [ ] Build longer form tasks, (code fix backed by testing) - [ ] Explicitly note the common failure modes in the test harness and fix them. Most of these appear to be failure modes with the core AutoGPT project - [ ] Switch to a ubuntu container so it can do more things (git, bash, etc) -- [ ] Lower priority, but put this in a webserver backend so we have a good API +- [ ] Lower priority, but put this in a webserver backend so we have a good API rather than doing container and file management for our interface between evals and our agent. - [ ] Get token counting data from the model Add scores to result files based on pricing associated with tokens and models used - [ ] Think about how this can be applied to other projects besides AutoGPT so we can be THE agent evaluation framework. - [ ] Figure our how the OpenAI Evals results are saved... -- cgit v1.2.3 From 486c7e3a5ea1a92472945ae6d42a855bd4191239 Mon Sep 17 00:00:00 2001 From: Douglas Schonholtz <15002691+dschonholtz@users.noreply.github.com> Date: Tue, 18 Apr 2023 11:10:24 -0400 Subject: Update README.md Adding set up info --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index f3b54648b..26aec3d3a 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,14 @@ Install the requirements with `pip install -r requirements.txt` +You must have a docker container built corresponding to the submodule below or the docker run command starting the agent will fail. + +Cd into the AutoGPT submodule and build/tag the dockerfile so the agent can be instantiated. 
+`cd auto_gpt_benchmarks/Auto-GPT` + +Build the container so we can run it procedurally! +`docker build -t autogpt .` + ## Running the tests EVALS_THREADS=1 EVALS_THREAD_TIMEOUT=600 oaieval auto_gpt_completion_fn test-match --registry_path $PWD/auto_gpt_benchmarking -- cgit v1.2.3 From f00ced6612896c0489eb83017777bc3e3652cc33 Mon Sep 17 00:00:00 2001 From: Douglas Schonholtz <15002691+dschonholtz@users.noreply.github.com> Date: Tue, 18 Apr 2023 11:59:42 -0400 Subject: Update README.md --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 26aec3d3a..52eb94454 100644 --- a/README.md +++ b/README.md @@ -9,9 +9,8 @@ A set of standardised benchmarks to assess the performance of Auto-GPTs. - [ ] Lower priority, but put this in a webserver backend so we have a good API rather than doing container and file management for our interface between evals and our agent. - [ ] Get token counting data from the model Add scores to result files based on pricing associated with tokens and models used - [ ] Think about how this can be applied to other projects besides AutoGPT so we can be THE agent evaluation framework. -- [ ] Figure our how the OpenAI Evals results are saved... +- [ ] Copy the OpenAI Eval files from the tmp file they are saved to somewhere we can track the results - [ ] Support multi-threaded evals. OpenAI has great support for this. The docker system built here doesn't. -- [ ] Make the file logger/duplicate op checker more robust. It's not great right now. ## Understanding OpenAI Evals -- cgit v1.2.3 From 625d6e72ecc1ba0336199e4cefbb41d409acf2d1 Mon Sep 17 00:00:00 2001 From: Douglas Schonholtz <15002691+dschonholtz@users.noreply.github.com> Date: Thu, 20 Apr 2023 15:41:29 -0400 Subject: Remove the submodule, reference OpenAI directly rather than running it on the command line, fix logging (#16) * Removed submodule, refactor, docker on pip, async docker logging, running our own tool on CLI rather than OpenAIs --- .gitignore | 2 + .gitmodules | 3 - README.md | 117 +++++++++++------ auto_gpt_benchmarking/Auto-GPT | 1 - auto_gpt_benchmarking/AutoGPTAgent.py | 80 ++++++++++-- auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml | 2 +- auto_gpt_benchmarking/CompletionFn.py | 21 ++- auto_gpt_benchmarking/Evaluator.py | 61 +++++++++ auto_gpt_benchmarking/LangChainCompletions.py | 34 ----- auto_gpt_benchmarking/__main__.py | 144 +++++++++++++++++++++ .../completion_fns/auto_gpt_completion_fn.yaml | 4 +- requirements.txt | 82 +++++++++++- 12 files changed, 452 insertions(+), 99 deletions(-) delete mode 160000 auto_gpt_benchmarking/Auto-GPT create mode 100644 auto_gpt_benchmarking/Evaluator.py delete mode 100644 auto_gpt_benchmarking/LangChainCompletions.py create mode 100644 auto_gpt_benchmarking/__main__.py diff --git a/.gitignore b/.gitignore index b6e47617d..e68877ae9 100644 --- a/.gitignore +++ b/.gitignore @@ -127,3 +127,5 @@ dmypy.json # Pyre type checker .pyre/ + +/data diff --git a/.gitmodules b/.gitmodules index d293ba9c4..e69de29bb 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +0,0 @@ -[submodule "Auto-GPT"] - path = auto_gpt_benchmarking/Auto-GPT - url = https://github.com/Significant-Gravitas/Auto-GPT.git diff --git a/README.md b/README.md index 52eb94454..5a75d5a33 100644 --- a/README.md +++ b/README.md @@ -1,69 +1,97 @@ # Auto-GPT-Benchmarks -A set of standardised benchmarks to assess the performance of Auto-GPTs. +A set of standardised benchmarks to assess the performance of Auto-GPT. 
+This currently uses the OpenAI Evals framework to run the benchmarks. -# What is next? +## Setup -- [ ] Build longer form tasks, (code fix backed by testing) -- [ ] Explicitly note the common failure modes in the test harness and fix them. Most of these appear to be failure modes with the core AutoGPT project -- [ ] Switch to a ubuntu container so it can do more things (git, bash, etc) -- [ ] Lower priority, but put this in a webserver backend so we have a good API rather than doing container and file management for our interface between evals and our agent. -- [ ] Get token counting data from the model Add scores to result files based on pricing associated with tokens and models used -- [ ] Think about how this can be applied to other projects besides AutoGPT so we can be THE agent evaluation framework. -- [ ] Copy the OpenAI Eval files from the tmp file they are saved to somewhere we can track the results -- [ ] Support multi-threaded evals. OpenAI has great support for this. The docker system built here doesn't. +You must add the auto_gpt_benchmarking dir to the python path +Do this with a path file in your venv. OpenAI evals needs to import it. +These instructions currently assume ubuntuy 22.04. +They should be fairly adaptable to the windows/MacOS equivalents. Please submit a PR if you would like to see your OS +documented. -## Understanding OpenAI Evals +Clone the repo with: -The Evals docs are here and very good: https://github.com/openai/evals/tree/main/docs + `git clone git@github.com:Significant-Gravitas/Auto-GPT-Benchmarks.git` + `cd Auto-GPT-Benchmarks` -The basic idea is this: -1. Use a completion function to point to the language model or in our case AutoGPT, the model you want to test. -2. Register that completion function with the evals framework with a yaml in a `completion_fns` dir. -3. Run the evals against the completion function. +Create a venv with -Then you can make more yaml defined evals and run them against the completion function as needed. + `python3.9 -m venv venv` -### Completions Functions -See our yaml file in `completion_fns` dir for the registration of the completion function. -See our completion function itself in CompletionFn.py -That points to the AutoGPT model we want to test which is spun up dynamically in a docker container in AutoGPTAgent.py +Activate it with + `source venv/bin/activate` -## Setup +Install the requirements with: -You must add the auto_gpt_benchmarking dir to the python path -Do this with a path file in your venv. OpenAI evals needs to import it. + `pip install -r requirements.txt` -Create a venv with +If you haven't already clone the AutoGPT repo somewhere else on your machine. +DO NOT CLONE IT INTO A SUBDIR OF THIS REPO. -`python3.9 -m venv venv` + `cd somewhere/else` + `git clone git@github.com:Significant-Gravitas/Auto-GPT.git` -Activate it with +You will need to update the .env file in the Auto-GPT repo to have your OpenAI api key. The file in question is at: + + `Auto-GPT/.env` -`source venv/bin/activate` +Finally, we assume you have a docker container built from the Dockerfile in the Auto-GPT repo. -Add a file to `venv/lib/python3.9/site-packages/benchmarking.pth` with the contents: -`/PATH/TO/REPO/Auto-GPT-Benchmarks-fork` +Build this with: -This is because evals tries to import it directly. 
+ `cd Auto-GPT` + `docker build -t autogpt .` -Install the requirements with +If you want to run with redis as your memory system, you can stand up a redis image in the AutoGPT repo with + + `docker compose up` -`pip install -r requirements.txt` +Then you will need to adjust some variables in your .env file to use the redis memory backend. +See the AutoGPT docs on how to do that. -You must have a docker container built corresponding to the submodule below or the docker run command starting the agent will fail. +Run your first eval with: -Cd into the AutoGPT submodule and build/tag the dockerfile so the agent can be instantiated. -`cd auto_gpt_benchmarks/Auto-GPT` + `cd Auto-GPT-Benchmarks` + `python3 -m auto_gpt_benchmarking test-match --auto-gpt-path /your/path/to/Auto-GPT` -Build the container so we can run it procedurally! -`docker build -t autogpt .` +You should only need to use the --auto-gpt-path flag the first time you run it. Afterwards, that will be saved in -## Running the tests + `auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml`. -EVALS_THREADS=1 EVALS_THREAD_TIMEOUT=600 oaieval auto_gpt_completion_fn test-match --registry_path $PWD/auto_gpt_benchmarking +To see a full list of available flags you can use run `python3 -m auto_gpt_benchmarking --help` +Some of these are inherited from the openAI evals framework and do not work quite as intended as they are not applicable +to this use case. + +This saves a file in `Auto-GPT-Benchmarks/data/records.jsonl` +This file is currently a default that is configurable with --record_path flag. You will have to specify the fully +qualified path. + +## Currently Supported Benchmarks: +From OpenAI Evals +- [x] test-match +- [x] test-fuzzy-match +- [ ] Everything else they have... + +## Understanding OpenAI Evals + +The Evals docs are here and very good: https://github.com/openai/evals/tree/main/docs + +The basic idea is this though: +1. Use a completion function to point to the language model or in our case AutoGPT, the model you want to test. +2. Register that completion function with the evals framework with a yaml in a `completion_fns` dir. +3. Run the evals against the completion function. + +Then you can make more also, yaml defined evals and run them against the completion function as needed. + +### Completions Functions + +See our yaml file in `completion_fns` dir for the registration of the completion function. +See our completion function itself in CompletionFn.py +That points to the AutoGPT model we want to test which is spun up dynamically in a docker container in AutoGPTAgent.py # Example final output: @@ -79,3 +107,12 @@ EVALS_THREADS=1 EVALS_THREAD_TIMEOUT=600 oaieval auto_gpt_completion_fn test-mat {"run_id": "230417220821DPM75QNS", "event_id": 5, "sample_id": "test-match.s1.0", "type": "match", "data": {"correct": false, "expected": "time", "picked": null, "sampled": "Once upon a time", "options": ["time"]}, "created_by": "", "created_at": "2023-04-17 22:12:04.691064+00:00"} (venv) douglas@douglas-XPS-15-9500:~/AGI/Auto-GPT-Benchmarks-fork$ +# What is next? + +- [ ] Run the rest of the OpenAI Evals Especially the modelgraded ones +- [ ] Build longer form tasks, (code fix backed by testing) +- [ ] Explicitly note the common failure modes in the test harness and fix them. 
Most of these appear to be failure modes with the core AutoGPT project +- [ ] Get token counting data from the model Add scores to result files based on pricing associated with tokens and models used +- [ ] Think about how this can be applied to other projects besides AutoGPT so we can be THE agent evaluation framework. +- [ ] Figure our how the OpenAI Evals results are saved... +- [ ] Support multi-threaded evals. OpenAI has great support for this. The docker system built here doesn't. diff --git a/auto_gpt_benchmarking/Auto-GPT b/auto_gpt_benchmarking/Auto-GPT deleted file mode 160000 index 97d62cc16..000000000 --- a/auto_gpt_benchmarking/Auto-GPT +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 97d62cc16bf45fcd406efeb33d042ebd58c24670 diff --git a/auto_gpt_benchmarking/AutoGPTAgent.py b/auto_gpt_benchmarking/AutoGPTAgent.py index 097311c73..63cebf1cb 100644 --- a/auto_gpt_benchmarking/AutoGPTAgent.py +++ b/auto_gpt_benchmarking/AutoGPTAgent.py @@ -10,7 +10,9 @@ The model is instantiated with a prompt from the AutoGPT completion function. Eventualy we will also save and log all of the associated output and thinking for the model as well """ from pathlib import Path -import os +import docker +import asyncio +import aiodocker class AutoGPTAgent: @@ -36,12 +38,34 @@ class AutoGPTAgent: if self.file_logger.exists(): self.file_logger.unlink() - def _copy_ai_settings(self): + def _copy_ai_settings(self) -> None: self.ai_settings_dest.write_text(self.ai_settings_file.read_text()) - def _copy_prompt(self): + def _copy_prompt(self) -> None: self.prompt_file.write_text(self.prompt) + async def _stream_logs(self, container: aiodocker.containers.DockerContainer) -> None: + try: + async for line in container.log(stdout=True, stderr=True, follow=True, tail="all"): + print(line.strip()) + await asyncio.sleep(1) + except aiodocker.exceptions.DockerError as e: + # Handle Docker errors (e.g., container is killed or removed) + print('Docker error: {}'.format(e)) + + async def _run_stream_logs(self) -> None: + """ + This grabs the docker containers id and streams the logs to the console with aiodocker. + :return: None + """ + async with aiodocker.Docker() as docker_client: + try: + container = docker_client.containers.container(self.container.id) + await self._stream_logs(container) + except aiodocker.exceptions.DockerError as e: + # Handle cases when the container is not found + print('Container not found: {}'.format(e)) + def _start_agent(self): """ This starts the agent in the docker container. @@ -51,9 +75,26 @@ class AutoGPTAgent: You also must set up the .env file in the Auto-GPT repo. 
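+        The container is started detached and its stdout/stderr are streamed
+        back asynchronously via aiodocker (see _run_stream_logs above).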
:return: """ + client = docker.from_env() env_file = self.auto_gpt_path / ".env" - # run it in continuous mode and skip re-prompts - os.system(f"docker run -it --env-file={env_file} -v {self.auto_workspace}:/home/appuser/auto_gpt_workspace -v {self.auto_gpt_path}/autogpt:/home/appuser/autogpt autogpt --continuous -C '/home/appuser/auto_gpt_workspace/ai_settings.yaml'") + envs = [ + f"{line.strip()}" for line in open( + env_file + ) if line.strip() != "" and line.strip()[0] != "#" and line.strip()[0] != "\n"] + + self.container = client.containers.run( + image="autogpt", + command="--continuous -C '/home/appuser/auto_gpt_workspace/ai_settings.yaml'", + environment=envs, + volumes={ + self.auto_workspace: {"bind": "/home/appuser/auto_gpt_workspace", "mode": "rw"}, + f"{self.auto_gpt_path}/autogpt": {"bind": "/home/appuser/autogpt", "mode": "rw"}, + }, + stdin_open=True, + tty=True, + detach=True + ) + asyncio.run(self._run_stream_logs()) def _poll_for_output(self): """ @@ -64,8 +105,8 @@ class AutoGPTAgent: if self.output_file.exists(): return self.output_file.read_text() - def __init__(self, prompt): - self.auto_gpt_path = Path(__file__).parent / "Auto-GPT" + def __init__(self, prompt, auto_gpt_path: str): + self.auto_gpt_path = Path(auto_gpt_path) self.auto_workspace = self.auto_gpt_path / "auto_gpt_workspace" self.prompt_file = self.auto_workspace / "prompt.txt" self.output_file = self.auto_workspace / "output.txt" @@ -76,16 +117,33 @@ class AutoGPTAgent: self._clean_up_workspace() self._copy_ai_settings() self._copy_prompt() + self.container = None + self.killing = False + self.logging_task = None def start(self): self._start_agent() answer = self._poll_for_output() - print('about to do clean up') - print(answer) - self._clean_up_workspace() - print('did clean up') + print(f"Prompt was: {self.prompt}, Answer was: {answer}") + self.kill() return answer + def kill(self): + if self.killing: + return + self.killing = True + self._clean_up_workspace() + if self.container: + # kill the container + try: + self.container.kill() + self.container.remove() + except docker.errors.APIError: + print('Couldn\'t find container to kill. Assuming container successfully killed itself.') + if self.logging_task: + self.logging_task.cancel() + self.killing = False + diff --git a/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml b/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml index ec995a666..ab6caaed0 100644 --- a/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml +++ b/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml @@ -1,6 +1,6 @@ ai_goals: - Evaluate the prompt in `prompt.txt` and find the best answer in the format provided. - Get the correct answer to the question in the fewest number of steps possible. You are scored first on if you get the correct answer, and second on how many tokens you take to get the right answer so keep your thinking and tool usage as minimal as possible while still ensuring you get the correct answer. -- Save the final answer and output to the `output.txt` file, the only file you should write to then immediately exit the program. +- Save the final answer and output to the `output.txt` file, the only file you should write to, then immediately exit the program because you are done. 
ai_name: EvaluationAgent ai_role: an ai that is tested on how effectively it can efficiently evaluate questions and answer them correctly while using as few resources as possible diff --git a/auto_gpt_benchmarking/CompletionFn.py b/auto_gpt_benchmarking/CompletionFn.py index 9bb4bb32b..f82ede85c 100644 --- a/auto_gpt_benchmarking/CompletionFn.py +++ b/auto_gpt_benchmarking/CompletionFn.py @@ -1,5 +1,3 @@ -import importlib -from typing import Optional from evals.api import CompletionFn, CompletionResult from evals.prompt.base import CompletionPrompt @@ -16,12 +14,21 @@ class AutoGPTCompletionResult(CompletionResult): class AutoGPTCompletionFn(CompletionFn): - def __init__(self, **kwargs) -> None: - pass + + def __init__(self, auto_gpt_path, **kwargs) -> None: + self.auto_gpt_path = auto_gpt_path + self.agent = None def __call__(self, prompt, **kwargs) -> AutoGPTCompletionResult: prompt = CompletionPrompt(prompt).to_formatted_prompt() - agent = AutoGPTAgent(prompt) - response = agent.start() + self.kill_agent() + self.agent = AutoGPTAgent(prompt, self.auto_gpt_path) + response = self.agent.start() record_sampling(prompt=prompt, sampled=response) - return AutoGPTCompletionResult(response) \ No newline at end of file + return AutoGPTCompletionResult(response) + + def kill_agent(self): + if self.agent: + self.agent.kill() + + diff --git a/auto_gpt_benchmarking/Evaluator.py b/auto_gpt_benchmarking/Evaluator.py new file mode 100644 index 000000000..4301fb3bc --- /dev/null +++ b/auto_gpt_benchmarking/Evaluator.py @@ -0,0 +1,61 @@ +""" +The evaluator class actually executes the evals. +""" +from evals.cli import oaieval +from evals.registry import Registry +from pathlib import Path +from typing import List, Optional, Tuple +import sys + + +class OAIRunArgs: + def __init__( + self, + completion_fn: str, + eval: str, + extra_eval_params: str = "", + max_samples: int = None, + cache: bool = True, + visible: bool = None, + seed: int = 20220722, + user: str = "", + record_path: str = None, + log_to_file: str = None, + debug: bool = False, + local_run: bool = True, + dry_run: bool = False, + dry_run_logging: bool = True, + ): + self.completion_fn = completion_fn + self.eval = eval + self.extra_eval_params = extra_eval_params + self.max_samples = max_samples + self.cache = cache + self.visible = visible + self.seed = seed + self.user = user + self.record_path = record_path + self.log_to_file = log_to_file + self.debug = debug + self.local_run = local_run + self.dry_run = dry_run + self.dry_run_logging = dry_run_logging + # create the record and logging paths if they don't exist + Path(self.record_path).parent.mkdir(parents=True, exist_ok=True) + # Path(self.log_to_file).parent.mkdir(parents=True, exist_ok=True) + # Registry path should be the auto_gpt_benchmarking folder + self.registry_path = None + + +class Evaluator: + def __init__(self, oai_run_args: OAIRunArgs): + self.oai_run_args = oai_run_args + registry_path = Path(__file__).parent + + # add registry path to the python system path + sys.path.append(str(registry_path)) + self.oai_run_args.registry_path = [registry_path] + # self.registry = Registry([registry_path]) + + def run(self): + oaieval.run(self.oai_run_args) diff --git a/auto_gpt_benchmarking/LangChainCompletions.py b/auto_gpt_benchmarking/LangChainCompletions.py deleted file mode 100644 index 17f52bfa1..000000000 --- a/auto_gpt_benchmarking/LangChainCompletions.py +++ /dev/null @@ -1,34 +0,0 @@ -import importlib -from typing import Optional -from evals.api import CompletionFn, 
CompletionResult - -from langchain.llms import BaseLLM - -from evals.prompt.base import CompletionPrompt -from evals.record import record_sampling - - -class LangChainLLMCompletionResult(CompletionResult): - def __init__(self, response) -> None: - self.response = response - - def get_completions(self) -> list[str]: - return [self.response.strip()] - - -class LangChainLLMCompletionFn(CompletionFn): - def __init__(self, llm: str, llm_kwargs: Optional[dict] = {}, **kwargs) -> None: - # Import and resolve self.llm to an instance of llm argument here, assuming it's always a subclass of BaseLLM - module = importlib.import_module("langchain.llms") - LLMClass = getattr(module, llm) - - if issubclass(LLMClass, BaseLLM): - self.llm = LLMClass(**llm_kwargs) - else: - raise ValueError(f"{llm} is not a subclass of BaseLLM") - - def __call__(self, prompt, **kwargs) -> LangChainLLMCompletionResult: - prompt = CompletionPrompt(prompt).to_formatted_prompt() - response = self.llm(prompt) - record_sampling(prompt=prompt, sampled=response) - return LangChainLLMCompletionResult(response) diff --git a/auto_gpt_benchmarking/__main__.py b/auto_gpt_benchmarking/__main__.py new file mode 100644 index 000000000..06f5145ce --- /dev/null +++ b/auto_gpt_benchmarking/__main__.py @@ -0,0 +1,144 @@ +""" +This is the main evaluation file. In it you can specify the following: + +1. The number of threads to use for evaluation. This is set to 1 by default.And will remain that way until we can spin + up containers on command +2. The timeout for each thread. This is set to 60 seconds by default. This is the amount of time each thread will run + for before it is killed when evaluating an agent +3. The path to the AutoGPT code. This is a required parameter as we do not know where your code lives. +4. The evals you would like to run. The options here are any OpenAI eval, or any of the evals defined in this repository + + +What this file does is it parses the params given and then runs the evals with OpenAI's evals framework. +""" + +import argparse +import os +import sys +from pathlib import Path +from datetime import datetime +import yaml + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser() + parser.add_argument("eval", type=str, help="Name of an eval. See registry.") + parser.add_argument( + "--completion-fn", + type=str, + dest="completion_fn", + default="auto_gpt_completion_fn", + help="One or more CompletionFn URLs, separated by commas (,). " + "A CompletionFn can either be the name of a model available in the OpenAI API or a key in the registry " + "(see evals/registry/completion_fns).", + ) + parser.add_argument( + "--timeout", + type=int, + default=300, + help="The timeout for each thread", + ) + parser.add_argument( + "--auto-gpt-path", + type=str, + default=None, + help="The path to the AutoGPT code. This updates auto_gpt_competion_fn.yaml in completion fns. 
" + "So you only need to set this once.", + ) + parser.add_argument("--extra_eval_params", type=str, default="") + parser.add_argument("--max_samples", type=int, default=None) + parser.add_argument("--cache", action=argparse.BooleanOptionalAction, default=True) + parser.add_argument("--visible", action=argparse.BooleanOptionalAction, default=None) + parser.add_argument("--seed", type=int, default=20220722) + parser.add_argument("--user", type=str, default="") + parser.add_argument("--record_path", type=str, default=str(Path(__file__).parent.parent / "data" / "records.jsonl")) + parser.add_argument( + "--log_to_file", type=str, default=None,#default=str( + # Path(__file__).parent.parent / "data" / "log" / "log.txt" + # ), help="Log to a file instead of stdout" + ) + parser.add_argument("--debug", action=argparse.BooleanOptionalAction, default=False) + parser.add_argument("--local-run", action=argparse.BooleanOptionalAction, default=True) + parser.add_argument("--dry-run", action=argparse.BooleanOptionalAction, default=False) + parser.add_argument("--dry-run-logging", action=argparse.BooleanOptionalAction, default=True) + return parser.parse_args() + + +def update_yaml_with_auto_gpt_path(yaml_path: str, auto_gpt_path: str or None) -> Path: + """ + If there is a given auto_gpt_path, then we need to update the yaml file to include it in the system path + If we don't have one. Then we get the path from the yaml. + If none exists in the yaml and we don't have a path then we raise an exception. + :param yaml_path: The path to the yaml file + :param auto_gpt_path: The path to the AutoGPT code + :return: The path to the AutoGPT code + """ + with open(yaml_path, "r") as f: + yaml_data = yaml.safe_load(f) + if yaml_data["auto_gpt_completion_fn"]["args"]["auto_gpt_path"] is None and auto_gpt_path is None: + raise Exception("You must specify a auto_gpt_path in the yaml file or pass it in as a parameter") + if auto_gpt_path is None: + auto_gpt_path = yaml_data["auto_gpt_completion_fn"]["args"]["auto_gpt_path"] + if auto_gpt_path is not None: + yaml_data["auto_gpt_completion_fn"]["args"]["auto_gpt_path"] = auto_gpt_path + with open(yaml_path, "w") as f: + yaml.safe_dump(yaml_data, f) + + return Path(auto_gpt_path).absolute() + + +def load_env_file(env_path: Path): + if not env_path.exists(): + raise FileNotFoundError('You must set the OpenAI key in the AutoGPT env file. ' + 'We need your api keys to start the AutoGPT agent and use OpenAI evals') + with open(env_path, "r") as f: + # find the OPENAI_API_KEY key split it from the equals sign and assign it so OpenAI evals can use it. 
+ for line in f.readlines(): + if line.startswith("OPENAI_API_KEY"): + os.environ["OPENAI_API_KEY"] = line.split("=")[1].strip() + break + + +if __name__ == "__main__": + args = parse_args() + # do not run in multiprocessing mode We do not use this right now, as it disables OpenAI's timeouts :( + # os.environ["EVALS_SEQUENTIAL"] = "1" + os.environ["EVALS_THREAD_TIMEOUT"] = str(args.timeout) + os.environ["EVALS_THREADS"] = str(1) + + # Update the yaml file with the auto_gpt_path + autogpt_path = update_yaml_with_auto_gpt_path( + str(Path(__file__).parent / "completion_fns" / "auto_gpt_completion_fn.yaml"), + args.auto_gpt_path + ) + + # Add the benchmarks path to the system path so we can import auto_gpt_benchmarking + sys.path.append(str(Path(__file__).parent.parent.absolute())) + + # load all of the environment variables in the auto-gpt path/.env file + load_env_file(Path(autogpt_path) / ".env") + + # Obviously, a top level import would be better. This allows us to set the API key with the env file, as it gets + # set in the evaluator. We can't set it before the import because the import will fail without an API key. + from auto_gpt_benchmarking.Evaluator import Evaluator, OAIRunArgs + run_args = OAIRunArgs( + completion_fn=args.completion_fn, + eval=args.eval, + extra_eval_params=args.extra_eval_params, + max_samples=args.max_samples, + cache=args.cache, + visible=args.visible, + seed=args.seed, + user=args.user, + record_path=args.record_path, + log_to_file=args.log_to_file, + debug=args.debug, + local_run=args.local_run, + dry_run=args.dry_run, + dry_run_logging=args.dry_run_logging) + + # Run the evals + evaluator = Evaluator( + run_args + ) + evaluator.run() diff --git a/auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml b/auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml index d6a55a29b..a101f000a 100644 --- a/auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml +++ b/auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml @@ -1,2 +1,4 @@ auto_gpt_completion_fn: - class: auto_gpt_benchmarking.CompletionFn:AutoGPTCompletionFn \ No newline at end of file + args: + auto_gpt_path: + class: auto_gpt_benchmarking.CompletionFn:AutoGPTCompletionFn diff --git a/requirements.txt b/requirements.txt index a59bcbdd3..b1c5914ad 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,81 @@ -evals \ No newline at end of file +aiodocker==0.21.0 +aiohttp==3.8.4 +aiosignal==1.3.1 +asn1crypto==1.5.1 +async-timeout==4.0.2 +attrs==23.1.0 +backoff==2.2.1 +blobfile==2.0.1 +cachetools==5.3.0 +certifi==2022.12.7 +cffi==1.15.1 +charset-normalizer==2.1.1 +click==8.1.3 +colorama==0.4.6 +contourpy==1.0.7 +cryptography==40.0.2 +cycler==0.11.0 +dataclasses-json==0.5.7 +docker==6.0.1 +evals==1.0.2.post1 +filelock==3.11.0 +fire==0.5.0 +fonttools==4.39.3 +frozenlist==1.3.3 +gptcache==0.1.13 +greenlet==2.0.2 +idna==3.4 +importlib-resources==5.12.0 +joblib==1.2.0 +kiwisolver==1.4.4 +langchain==0.0.142 +langdetect==1.0.9 +lxml==4.9.2 +lz4==4.3.2 +marshmallow==3.19.0 +marshmallow-enum==1.5.1 +matplotlib==3.7.1 +mock==5.0.2 +multidict==6.0.4 +mypy==1.2.0 +mypy-extensions==1.0.0 +nltk==3.8.1 +numexpr==2.8.4 +numpy==1.24.2 +openai==0.27.4 +openapi-schema-pydantic==1.2.4 +oscrypto==1.3.0 +packaging==23.1 +pandas==1.5.3 +Pillow==9.5.0 +portalocker==2.7.0 +pyarrow==10.0.1 +pycparser==2.21 +pycryptodomex==3.17 +pydantic==1.10.7 +PyJWT==2.6.0 +pyOpenSSL==23.1.1 +pyparsing==3.0.9 +python-dateutil==2.8.2 +pytz==2023.3 +PyYAML==6.0 +pyzstd==0.15.6 +regex==2023.3.23 
+requests==2.28.2 +sacrebleu==2.3.1 +setuptools-scm==7.1.0 +six==1.16.0 +snowflake-connector-python==3.0.2 +SQLAlchemy==1.4.47 +tabulate==0.9.0 +tenacity==8.2.2 +termcolor==2.2.0 +tiktoken==0.3.3 +tomli==2.0.1 +tqdm==4.65.0 +typing-inspect==0.8.0 +typing_extensions==4.5.0 +urllib3==1.26.15 +websocket-client==1.5.1 +yarl==1.8.2 +zipp==3.15.0 -- cgit v1.2.3 From 011ed2f2b97840921539dc385891ebf9f7701e78 Mon Sep 17 00:00:00 2001 From: Douglas Schonholtz <15002691+dschonholtz@users.noreply.github.com> Date: Thu, 20 Apr 2023 15:47:15 -0400 Subject: Update README.md (#17) remove -m --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5a75d5a33..fe8bb6b04 100644 --- a/README.md +++ b/README.md @@ -56,7 +56,7 @@ See the AutoGPT docs on how to do that. Run your first eval with: `cd Auto-GPT-Benchmarks` - `python3 -m auto_gpt_benchmarking test-match --auto-gpt-path /your/path/to/Auto-GPT` + `python3 auto_gpt_benchmarking test-match --auto-gpt-path /your/path/to/Auto-GPT` You should only need to use the --auto-gpt-path flag the first time you run it. Afterwards, that will be saved in -- cgit v1.2.3 From ef5c4f8a11b23667860acf0e6689ec195d056bd2 Mon Sep 17 00:00:00 2001 From: Media <12145726+rihp@users.noreply.github.com> Date: Fri, 21 Apr 2023 01:04:34 +0200 Subject: Graphs for evals (#20) * Update README.md * Jupyter Notebook for evaluating eval results --------- Co-authored-by: Douglas Schonholtz <15002691+dschonholtz@users.noreply.github.com> --- auto_gpt_benchmarking/__main__.py | 38 ++++--- evals_analytics.ipynb | 220 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 245 insertions(+), 13 deletions(-) create mode 100644 evals_analytics.ipynb diff --git a/auto_gpt_benchmarking/__main__.py b/auto_gpt_benchmarking/__main__.py index 06f5145ce..c42c73b8e 100644 --- a/auto_gpt_benchmarking/__main__.py +++ b/auto_gpt_benchmarking/__main__.py @@ -18,11 +18,14 @@ import sys from pathlib import Path from datetime import datetime import yaml +from datetime import datetime + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser() - parser.add_argument("eval", type=str, help="Name of an eval. See registry.") + parser.add_argument( + "eval", type=str, help="Name of an eval. 
See registry.") parser.add_argument( "--completion-fn", type=str, @@ -47,20 +50,27 @@ def parse_args() -> argparse.Namespace: ) parser.add_argument("--extra_eval_params", type=str, default="") parser.add_argument("--max_samples", type=int, default=None) - parser.add_argument("--cache", action=argparse.BooleanOptionalAction, default=True) - parser.add_argument("--visible", action=argparse.BooleanOptionalAction, default=None) + parser.add_argument( + "--cache", action=argparse.BooleanOptionalAction, default=True) + parser.add_argument( + "--visible", action=argparse.BooleanOptionalAction, default=None) parser.add_argument("--seed", type=int, default=20220722) parser.add_argument("--user", type=str, default="") - parser.add_argument("--record_path", type=str, default=str(Path(__file__).parent.parent / "data" / "records.jsonl")) + parser.add_argument("--record_path", type=str, default=str(Path( + __file__).parent.parent / "data" / f"eval-{datetime.now().strftime('%Y%m%d-%H%M%S')}.jsonl")) parser.add_argument( - "--log_to_file", type=str, default=None,#default=str( - # Path(__file__).parent.parent / "data" / "log" / "log.txt" - # ), help="Log to a file instead of stdout" + "--log_to_file", type=str, default=None, # default=str( + # Path(__file__).parent.parent / "data" / "log" / "log.txt" + # ), help="Log to a file instead of stdout" ) - parser.add_argument("--debug", action=argparse.BooleanOptionalAction, default=False) - parser.add_argument("--local-run", action=argparse.BooleanOptionalAction, default=True) - parser.add_argument("--dry-run", action=argparse.BooleanOptionalAction, default=False) - parser.add_argument("--dry-run-logging", action=argparse.BooleanOptionalAction, default=True) + parser.add_argument( + "--debug", action=argparse.BooleanOptionalAction, default=False) + parser.add_argument( + "--local-run", action=argparse.BooleanOptionalAction, default=True) + parser.add_argument( + "--dry-run", action=argparse.BooleanOptionalAction, default=False) + parser.add_argument("--dry-run-logging", + action=argparse.BooleanOptionalAction, default=True) return parser.parse_args() @@ -76,7 +86,8 @@ def update_yaml_with_auto_gpt_path(yaml_path: str, auto_gpt_path: str or None) - with open(yaml_path, "r") as f: yaml_data = yaml.safe_load(f) if yaml_data["auto_gpt_completion_fn"]["args"]["auto_gpt_path"] is None and auto_gpt_path is None: - raise Exception("You must specify a auto_gpt_path in the yaml file or pass it in as a parameter") + raise Exception( + "You must specify a auto_gpt_path in the yaml file or pass it in as a parameter") if auto_gpt_path is None: auto_gpt_path = yaml_data["auto_gpt_completion_fn"]["args"]["auto_gpt_path"] if auto_gpt_path is not None: @@ -108,7 +119,8 @@ if __name__ == "__main__": # Update the yaml file with the auto_gpt_path autogpt_path = update_yaml_with_auto_gpt_path( - str(Path(__file__).parent / "completion_fns" / "auto_gpt_completion_fn.yaml"), + str(Path(__file__).parent / "completion_fns" / + "auto_gpt_completion_fn.yaml"), args.auto_gpt_path ) diff --git a/evals_analytics.ipynb b/evals_analytics.ipynb new file mode 100644 index 000000000..f1b48424c --- /dev/null +++ b/evals_analytics.ipynb @@ -0,0 +1,220 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYgAAAEWCAYAAAB8LwAVAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAc5klEQVR4nO3deZgddZ3v8feHALLKYoLGrCiIoBDABlRQQBEiKsHHLVEUvGDmOuK43RnBmQcYnHH0ehVHwYHoRERlEREmwwQRRhEVkTSILEEkRiAJYBrCpiAY+Nw/6tfDSVOdPunu6pNOf17Pc56u+v1q+Vaf5Hy6llMl20RERPS1UacLiIiI9VMCIiIiaiUgIiKiVgIiIiJqJSAiIqJWAiIiImolICI6TNJrJN3e6Toi+kpAREdJukrSg5Ke0+lamiDpIEnLa9qvknQcgO2f2t6ljWWdIunbTdQZUScBER0jaTrwGsDAESO87o1Hcn3ru/w+ok4CIjrpfcC1wNnA0a0dkqZI+r6kHkkPSDq9pe8Dkm6T9KikxZL2Lu2WtFPLdGdL+qcyfJCk5ZI+Kek+4BuStpN0aVnHg2V4csv820v6hqR7Sv8lpf0WSW9pmW4TSfdL2mswv4S+exmlxhVl+26X9HpJM4FPAe+S9EdJvy7TvlDSAkmrJC2R9IGW5Wwu6Zul9tsk/V2f9dxZ1nUT8CdJG0s6QdLvWn63b22Z/hhJP5d0mqSHJC2V9OrSvkzSSklrvI8xuiUgopPeB3ynvA6T9HwASeOAS4G7gOnAJOD80vcO4JQy73Op9jweaHN9LwC2B6YBc6n+/X+jjE8FHgdOb5n+W8AWwMuAHYDTSvs5wFEt0x0O3Gv7V23W0S9JuwDHA/vY3ho4DLjT9g+AzwAX2N7K9owyy/nAcuCFwNuBz0h6Xek7mer39yLgDX1q7jUHeBOwre3VwO+o9uq2Af4R+LakiS3T7wfcBDwPOLesfx9gp7L80yVtNdTfQ6wnbOeV14i/gAOAvwDjy/hvgI+V4VcBPcDGNfNdDnykn2Ua2Kll/Gzgn8rwQcCTwGZrqWlP4MEyPBF4GtiuZroXAo8Czy3j3wP+rp9lHlSW81Cf12rguJZplpfhnYCVwCHAJn2WdQrw7ZbxKcBTwNYtbf8CnF2GlwKHtfQd17ueMn4n8L8GeJ9uBGaV4WOAO1r6di+/8+e3tD0A7Nnpf195Dc8rexDRKUcDP7R9fxk/l2cOM00B7nL1F21fU6j+yh2MHtt/7h2RtIWksyTdJekR4Gpg27IHMwVYZfvBvguxfQ/wc+BtkrYF3ki1F9Sfe2xv2/oCflY3oe0lwEepwmClpPMlvbCf5b6w1PhoS9tdVHtcvf3LWvpah2vbJL1P0o3lENJDwMuB8S2T/KFl+PFSc9+27EFsIBIQMeIkbQ68EzhQ0n3lnMDHgBmSZlB9aE3t58TpMuDF/Sz6MapDQr1e0Ke/762LPwHsAuxn+7nAa3tLLOvZvgRAnW9SHVJ5B/AL2yv6mW6d2T7X9gFUh74MfK6f+u8pNW7d0jYV6K3lXmByS9+UutX1DkiaBnyN6hDX80qQ3UL1+4gxKAERnXAk1aGR3agO6+wJ7Ar8lOrcwnVUH26flbSlpM0k7V/m/TrwfyS9QpWdygcbVIdD3i1pXDmpe+AAdWxN9RfvQ5K2pzpmD4Dte4HLgK+Wk9mbSHpty7yXAHsDH6E6JzEsJO0i6XWqLvv9c6nv6dL9B2C6pI1KjcuAa4B/Kb+jPYBjgd5LYb8LnFjqn0T1wb82W1IFRk+p5f1UexAxRiUgohOOBr5h+27b9/W+qE4Qv4fqL9a3UB2Pv5vqJOy7AGxfCPwz1SGpR6k+qLcvy/1Ime+hspxLBqjjS8DmwP1UV1P9oE//e6nOk/yG6rzAR3s7bD8OXATsCHy/7S0f2HOAz5aa7qM6OX5i6buw/HxA0g1leA7Vieh7gIuBk21fWfpOpfrd/R64kupcyRP9rdj2YuALwC+owmh3qkNpMUbJzgODIgZD0knAS2zXXR203pH0QWC27YH2rCKA7EFEDEo5JHUsMK/TtfRH0kRJ+0vaqFw++wmqvYyItiQgItZR+TLaMuAy21d3up612BQ4i+pQ3I+A/wC+2tGKYlTJIaaIiKiVPYiIiKi1Qd2ga/z48Z4+fXqny4iIGDWuv/76+21PqOvboAJi+vTpdHd3d7qMiIhRQ9Jd/fXlEFNERNRKQERERK0ERERE1EpARERErQRERETUSkBEREStxgJC1TOFf1yea3urpI/UTCNJXy7P0r1J5dnCpe9oSXeUV55zGxExwpr8HsRq4BO2bygPNLle0hXllsK93gjsXF77Af8G7Ndyb/4uqvvTXy9pQd3TvSIiohmN7UHYvtf2DWX4UeA2nnkUYq9ZwDmuXEv1uMeJVA9qv8J27yMfrwBmNlVrREQ824icg5A0HdgL+GWfrkms+Uzc5aWtv/a6Zc+V1C2pu6enZ9hqjohYn02cPBVJSGLi5KmNrKPxgJC0FdWTtz5q+5HhXr7teba7bHdNmFB7O5GIiA3OfSuWMe2TlzLtk5dy34plA88wCI0GhKRNqMLhO7brHsu4gjUfpD65tPXXHhERI6TJq5gE/Dtwm+0v9jPZAuB95WqmVwIPl4fFXw4cWh62vh1waGmLiIgR0uRVTPtTPfT9Zkk3lrZPAVMBbJ8JLAQOB5YAjwHvL32rJH0aWFTmO9X2qgZrjYiIPhoLCNs/AzTANAY+1E/ffGB+A6VFREQb8k3qiIiolYCIiIhaCYiIiKiVgIiIiFoJiIiIqJWAiIiIWgmIiIiolYCIiIhaCYiIiKiVgIiIiFoJiIiIqJWAiIiIWgmIiIiolYCIiIhaCYiIiKiVgIiIiFqNPTBI0nzgzcBK2y+v6f9b4D0tdewKTChPk7sTeBR4Clhtu6upOiMiol6TexBnAzP767T9edt72t4TOBH4SZ/Hih5c+hMOEREd0FhA2L4aaPc50nOA85qqJSIi1l3Hz0FI2oJqT+OilmYDP5R0vaS5naksImJsa+wcxDp4C/DzPoeXDrC9QtIOwBWSflP2SJ6lBMhcgKlTpzZfbUTEGNHxPQhgNn0OL9leUX6uBC4G9u1vZtvzbHfZ7powYUKjhUZEjCUdDQhJ2wAHAv/R0ralpK17h4FDgVs6U2FExNjV5GWu5wEHAeMlLQdOBjYBsH1mmeytwA9t/6ll1ucDF0vqre9c2z9oqs6IiKjXWEDYntPGNGdTXQ7b2rYUmNFMVRER0a714RxERESshxIQERFRKwERERG1EhAREVErAREREbUSEBERUSsBERERtRIQERFRKwERERG1EhAREVErAREREbUSEBERUSsBERERtRIQERFRKwERERG1EhAREVErAREREbUaCwhJ8yWtlFT7PGlJB0l6WNKN5XVSS99MSbdLWiLphKZqjIiI/jW5B3E2MHOAaX5qe8/yOhVA0jjgDOCNwG7AHEm7NVhnRETUaCwgbF8NrBrErPsCS2wvtf0kcD4wa1iLi4iIAXX6HMSrJP1a0mWSXl
baJgHLWqZZXtpqSZorqVtSd09PT5O1RkSMKZ0MiBuAabZnAF8BLhnMQmzPs91lu2vChAnDWV9ExJjWsYCw/YjtP5bhhcAmksYDK4ApLZNOLm0RETGCOhYQkl4gSWV431LLA8AiYGdJO0raFJgNLOhUnRERY9XGTS1Y0nnAQcB4ScuBk4FNAGyfCbwd+KCk1cDjwGzbBlZLOh64HBgHzLd9a1N1RkREvcYCwvacAfpPB07vp28hsLCJuiIioj2dvoopIiLWUwmIiIiolYCIiIhaCYiIiKiVgIiIiFoJiIiIqJWAiIiIWgmIiIiolYCIiIhaCYiIiKiVgIiIiFoJiIiIqJWAiIiIWgmIiIiolYCIiIhaCYiIiKiVgIiIiFqNBYSk+ZJWSrqln/73SLpJ0s2SrpE0o6XvztJ+o6TupmqMiIj+tRUQknYfxLLPBmaupf/3wIG2dwc+Dczr03+w7T1tdw1i3RERMUTt7kF8VdJ1kv5a0jbtzGD7amDVWvqvsf1gGb0WmNxmLRERMQLaCgjbrwHeA0wBrpd0rqQ3DGMdxwKXta4S+KGk6yXNXduMkuZK6pbU3dPTM4wlRUSMbRu3O6HtOyT9A9ANfBnYS5KAT9n+/mALkHQwVUAc0NJ8gO0VknYArpD0m7JHUlfXPMrhqa6uLg+2joiIWFO75yD2kHQacBvwOuAttnctw6cNduWS9gC+Dsyy/UBvu+0V5edK4GJg38GuIyIiBqfdcxBfAW4AZtj+kO0bAGzfA/zDYFYsaSrwfeC9tn/b0r6lpK17h4FDgdoroSIiojntHmJ6E/C47acAJG0EbGb7MdvfqptB0nnAQcB4ScuBk4FNAGyfCZwEPI/qBDjA6nLF0vOBi0vbxsC5tn8wuM2LiIjBajcgrgQOAf5YxrcAfgi8ur8ZbM9Z2wJtHwccV9O+FJjx7DkiImIktXuIaTPbveFAGd6imZIiImJ90G5A/EnS3r0jkl4BPN5MSRERsT5o9xDTR4ELJd0DCHgB8K6mioqIiM5rKyBsL5L0UmCX0nS77b80V1ZERHRa21+UA/YBppd59paE7XMaqSoiIjqurYCQ9C3gxcCNwFOl2UACIiJiA9XuHkQXsJvt3MoiImKMaPcqpluoTkxHRMQY0e4exHhgsaTrgCd6G20f0UhVERHRce0GxClNFhEREeufdi9z/YmkacDOtq+UtAUwrtnSIiKik9q93fcHgO8BZ5WmScAlDdUUERHrgXZPUn8I2B94BKqHBwE7NFVURER0XrsB8YTtJ3tHJG1M9T2IiIjYQLUbED+R9Clg8/Is6guB/2yurIiI6LR2A+IEoAe4GfgrYCGDfJJcRESMDu1exfQ08LXyioiIMaDdq5h+L2lp31cb882XtFJS7TOlVfmypCWSburzzImjJd1RXke3v0kRETEc1uVeTL02A94BbN/GfGcDp9P/Tf3eCOxcXvsB/wbsJ2l7qmdYd1GdDL9e0gLbD7ZZb0REDFFbexC2H2h5rbD9JeBNbcx3NbBqLZPMAs5x5VpgW0kTgcOAK2yvKqFwBTCznVojImJ4tHuIae+WV5ek/826PUuiP5OAZS3jy0tbf+11tc2V1C2pu6enZxhKinZMnDwVSUhi4uSpo275se7ynow97X7If6FleDVwJ/DOYa9mEGzPA+YBdHV15bsZI+S+FcuY9slLAbjrc28edcuPdZf3ZOxp9yqmgxta/wpgSsv45NK2AjioT/tVDdUQERE12n2i3MfX1m/7i4Nc/wLgeEnnU52kftj2vZIuBz4jabsy3aHAiYNcR0REDMK6XMW0D9UHOsBbgOuAO9Y2k6TzqPYExktaTnVl0iYAts+k+sLd4cAS4DHg/aVvlaRPA4vKok61vbaT3RERMczaDYjJwN62HwWQdArwX7aPWttMtucM0G+qGwHW9c0H5rdZX0REDLN2b7XxfODJlvEnS1tERGyg2t2DOAe4TtLFZfxI4JuNVBQREeuFdq9i+mdJlwGvKU3vt/2r5sqKiIhOa/cQE8AWwCO2/xVYLmnHhmqKiIj1QLvfpD4Z+CTPXGq6CfDtpoqKiIjOa3cP4q3AEcCfAGzfA2zdVFEREdF57QbEk+WSVANI2rK5kiIiYn3QbkB8V9JZVHdb/QBwJXl4UETEBm3Aq5gkCbgAeCnwCLALcJLtKxquLSIiOmjAgLBtSQtt7071XIaIiBgD2j3EdIOkfRqtJCIi1ivtfpN6P+AoSXdSXckkqp2LPZoqLCIiOmutASFpqu27qR4BGhERY8hAexCXUN3F9S5JF9l+2wjUFBER64GBzkGoZfhFTRYSERHrl4ECwv0MR0TEBm6gQ0wzJD1CtSexeRmGZ05SP7fR6iIiomPWGhC2xw1l4ZJmAv8KjAO+bvuzffpPAw4uo1sAO9jetvQ9Bdxc+u62fcRQaomIiHXT7mWu60zSOOAM4A3AcmCRpAW2F/dOY/tjLdN/GNirZRGP296zqfoiImLt1uV5EOtqX2CJ7aW2nwTOB2atZfo5wHkN1hMREeugyYCYBCxrGV9e2p5F0jRgR+BHLc2bSeqWdK2kI/tbiaS5Zbrunp6eYSg7IiKg2YBYF7OB79l+qqVtmu0u4N3AlyS9uG5G2/Nsd9numjBhwkjUGhExJjQZECuAKS3jk0tbndn0Obxke0X5uRS4ijXPT0RERMOaDIhFwM6SdpS0KVUILOg7kaSXAtsBv2hp207Sc8rweGB/YHHfeSMiojmNXcVke7Wk44HLqS5znW/7VkmnAt22e8NiNnB+eWJdr12BsyQ9TRVin229+ikiIprXWEAA2F4ILOzTdlKf8VNq5rsG2L3J2iIiYu3Wl5PUERGxnklARERErQRERETUSkBEREStBERERNRKQERERK0ERERE1EpARERErQRERETUSkBEREStBERERNRKQERERK0ERERE1EpARERErQRERETUSkBEREStBERERNRqNCAkzZR0u6Qlkk6o6T9GUo+kG8vruJa+oyXdUV5HN1lnREQ8W2OPHJU0DjgDeAOwHFgkaUHNs6UvsH18n3m3B04GugAD15d5H2yq3oiIWFOTexD7AktsL7X9JHA+MKvNeQ8DrrC9qoTCFcDMhuqMiIgaTQbEJGBZy/jy0tbX2yTdJOl7kqas47xImiupW1J3T0/PcNQdERF0/iT1fwLTbe9BtZfwzXVdgO15trtsd02YMGHYC4yIGKuaDIgVwJSW8cml7X/YfsD2E2X068Ar2p03IiKa1WRALAJ2lrSjpE2B2cCC1gkkTWwZPQK4rQxfDhwqaTtJ2wGHlraIiBghjV3FZHu1pOOpPtjHAfNt3yrpVKDb9gLgbyQdAawGVgHHlHlXSfo0VcgAnGp7VVO1RkTEszUWEAC2FwIL+7Sd1DJ8InBiP/POB+Y3WV9ERPSv0yepIyJiPZWAiIiIWgmIiIiolYCIiIhaCYiIiKiVgIiIiFoJiIiIqJWAiIiIWgmIiIiolYCIiIhaCYiIiKiVgIiIiFoJiIiIqJWAiIiIWgmIiIiolYCIiIhaCYiIiKjVaEBIminpd
klLJJ1Q0/9xSYsl3STpvyVNa+l7StKN5bWg77wREdGsxh45KmkccAbwBmA5sEjSAtuLWyb7FdBl+zFJHwT+L/Cu0ve47T2bqi8iItauyT2IfYEltpfafhI4H5jVOoHtH9t+rIxeC0xusJ6IiFgHTQbEJGBZy/jy0tafY4HLWsY3k9Qt6VpJR/Y3k6S5Zbrunp6eIRUcERHPaOwQ07qQdBTQBRzY0jzN9gpJLwJ+JOlm27/rO6/tecA8gK6uLo9IwRERY0CTexArgCkt45NL2xokHQL8PXCE7Sd6222vKD+XAlcBezVYa0RE9NFkQCwCdpa0o6RNgdnAGlcjSdoLOIsqHFa2tG8n6TlleDywP9B6cjsiIhrW2CEm26slHQ9cDowD5tu+VdKpQLftBcDnga2ACyUB3G37CGBX4CxJT1OF2Gf7XP0UERENa/QchO2FwMI+bSe1DB/Sz3zXALs3WVtERKxdvkkdERG1EhAREVErAREREbUSEBERUSsBERERtRIQERFRKwERERG1EhAREVErAREREbUSEBERUSsBERERtRIQERFRKwERERG1EhAREVErAREREbUSEBERUSsBERERtRoNCEkzJd0uaYmkE2r6nyPpgtL/S0nTW/pOLO23SzqsyTojIuLZGgsISeOAM4A3ArsBcyTt1meyY4EHbe8EnAZ8rsy7GzAbeBkwE/hqWV5ERIyQJvcg9gWW2F5q+0ngfGBWn2lmAd8sw98DXi9Jpf1820/Y/j2wpCwvIiJGyMYNLnsSsKxlfDmwX3/T2F4t6WHgeaX92j7zTqpbiaS5wNwy+kdJtw+y3vHA/YOcd7Qa0jbf9bk3/89wlevDq6Hlj7X3eVi3t+n3fJiMmfe45f0YL2mw2zytv44mA2JE2J4HzBvqciR12+4ahpJGjWzzhm+sbS9km4dTk4eYVgBTWsYnl7baaSRtDGwDPNDmvBER0aAmA2IRsLOkHSVtSnXSeUGfaRYAR5fhtwM/su3SPrtc5bQjsDNwXYO1RkREH40dYirnFI4HLgfGAfNt3yrpVKDb9gLg34FvSVoCrKIKEcp03wUWA6uBD9l+qqlaiyEfphqFss0bvrG2vZBtHjaq/mCPiIhYU75JHRERtRIQERFRa8wFxFBu/zEatbG9H5e0WNJNkv5bUr/XRI8WA21zy3Rvk2RJo/6SyHa2WdI7y3t9q6RzR7rG4dbGv+2pkn4s6Vfl3/fhnahzuEiaL2mlpFv66ZekL5ffx02S9h7ySm2PmRfVyfLfAS8CNgV+DezWZ5q/Bs4sw7OBCzpdd8PbezCwRRn+4Gje3na3uUy3NXA11Rcyuzpd9wi8zzsDvwK2K+M7dLruEdjmecAHy/BuwJ2drnuI2/xaYG/gln76DwcuAwS8EvjlUNc51vYghnL7j9FowO21/WPbj5XRa6m+czKatfMeA3ya6t5ffx7J4hrSzjZ/ADjD9oMAtleOcI3DrZ1tNvDcMrwNcM8I1jfsbF9NdbVnf2YB57hyLbCtpIlDWedYC4i623/0vYXHGrf/AHpv/zEatbO9rY6l+gtkNBtwm8uu9xTb/zWShTWonff5JcBLJP1c0rWSZo5Ydc1oZ5tPAY6StBxYCHx4ZErrmHX9/z6gUX+rjRgeko4CuoADO11LkyRtBHwROKbDpYy0jakOMx1EtZd4taTdbT/UyaIaNgc42/YXJL2K6jtXL7f9dKcLGy3G2h7EUG7/MRq1dcsSSYcAfw8cYfuJEaqtKQNt89bAy4GrJN1Jdax2wSg/Ud3O+7wcWGD7L67ukPxbqsAYrdrZ5mOB7wLY/gWwGdWN/DZUw36LorEWEEO5/cdoNOD2StoLOIsqHEb7cWkYYJttP2x7vO3ptqdTnXc5wnZ3Z8odFu38u76Eau8BSeOpDjktHcEah1s723w38HoASbtSBUTPiFY5shYA7ytXM70SeNj2vUNZ4Jg6xOQh3P5jNGpzez8PbAVcWM7F3237iI4VPURtbvMGpc1tvhw4VNJi4Cngb22P1j3jdrf5E8DXJH2M6oT1MaP4jz0knUcV8uPLeZWTgU0AbJ9JdZ7lcKrn5zwGvH/I6xzFv6+IiGjQWDvEFBERbUpARERErQRERETUSkBEREStBERERNRKQET0IenIcpfXl3a6lohOSkBEPNsc4GflZyMkjWtq2RHDJQER0ULSVsABVLdpmF3axkn6f5JuKffZ/3Bp30fSNZJ+Lek6SVtLOkbS6S3Lu1TSQWX4j5K+IOnXwKsknSRpUVnuvN67BkvaSdKVZbk3SHqxpHMkHdmy3O9IqrtLbcSwSUBErGkW8APbvwUekPQKYC4wHdjT9h7Ad8rtHS4APmJ7BnAI8PgAy96S6h79M2z/DDjd9j62Xw5sDry5TPcdqltzzwBeDdxL9Q3/YwAkbVPaN5S70cZ6KgERsaY5VM8WoPycQ/Xhf1a5/Tu2VwG7APfaXlTaHuntX4ungItaxg9W9dTCm4HXAS+TtDUwyfbFZbl/tv2Y7Z9Q3XtoQqnpojbWFzEkY+peTBFrI2l7qg/q3SWZ6h4/proxXLtWs+YfXpu1DP/Z9lNlXZsBX6V6mt0ySaf0mbbOOcBRVIe+hnyfnYiBZA8i4hlvB75le1q52+sU4PdUj7P8q3L7994guR2YKGmf0rZ16b8T2FPSRpKmUD35rE5vGNxfznu8HcD2o8Dy3vMNqp6RvkWZ9mzgo2W6xcO21RH9SEBEPGMOcHGftouAiVS3jr6pnGB+d3nM5buAr5S2K6g+9H9OFSqLgS8DN9StqDyo52vALVR3JG3dS3kv8DeSbgKuAV5Q5vkDcBvwjaFuaEQ7cjfXiFGi7EncDOxt++FO1xMbvuxBRIwC5al/twFfSTjESMkeRERE1MoeRERE1EpARERErQRERETUSkBEREStBERERNT6/5WLAWlxQhHkAAAAAElFTkSuQmCC", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import os\n", + "import json\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def load_jsonl_files_recursively(dir_path):\n", + " all_data = []\n", + " \n", + " for root, _, files in os.walk(dir_path):\n", + " for file in files:\n", + " if file.endswith(\".jsonl\"):\n", + " file_path = os.path.join(root, file)\n", + " with open(file_path, \"r\") as f:\n", + " file_data = [json.loads(line) for line in f]\n", + " all_data.extend(file_data)\n", + " \n", + " return all_data\n", + "\n", + "def extract_accuracies(data):\n", + " accuracies = []\n", + " for record in data:\n", + " if 'final_report' in record:\n", + " accuracy = record['final_report']['accuracy']\n", + " accuracies.append(accuracy)\n", + " return accuracies\n", + "\n", + "# Load the data recursively\n", + "dir_path = \"evals\"\n", + "data = load_jsonl_files_recursively(dir_path)\n", + "\n", + "# Extract accuracies from the data\n", + "accuracies = extract_accuracies(data)\n", + "\n", + "# Plot the accuracies in a histogram chart\n", + "plt.hist(accuracies, bins=100, range=(0, 1), edgecolor='black')\n", + "plt.xlabel(\"Accuracy\")\n", + "plt.ylabel(\"Frequency\")\n", + "plt.title(\"Accuracy Histogram\")\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Counts for each eval_name:\n", + "test-match.s1.simple-v0: 22\n", + "None: 45\n", + "test-fuzzy-match.s1.simple-v0: 2\n" + ] + } + ], + "source": [ + "import os\n", + "import json\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def print_graph():\n", + " directory = 'evals/'\n", + " files = os.listdir(directory)\n", + " \n", + " eval_name_counter = {}\n", + " \n", + " for file in files:\n", + " if file.endswith(\".jsonl\"):\n", + " with open(os.path.join(directory, file), 'r') as f:\n", + " jsonl_content = f.read()\n", + " \n", + " # Read the JSONL content into a DataFrame\n", + " data = [json.loads(line) for line in jsonl_content.split('\\n') if line]\n", + " df = pd.DataFrame(data)\n", + "\n", + " if 'spec' not in df.columns:\n", + " continue\n", + "\n", + " # Extract the \"eval_name\" from the \"spec\" dictionaries\n", + " df['eval_name'] = df['spec'].apply(lambda x: x['eval_name'] if isinstance(x, dict) else None)\n", + "\n", + " for eval_name in df['eval_name']:\n", + " if eval_name not in eval_name_counter:\n", + " eval_name_counter[eval_name] = 0\n", + " eval_name_counter[eval_name] += 1\n", + "\n", + " # Print the counts\n", + " print(\"Counts for each eval_name:\")\n", + " for eval_name, count in eval_name_counter.items():\n", + " print(f\"{eval_name}: {count}\")\n", + "\n", + "print_graph()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Counts for each eval_name:\n", + "test-match.s1.simple-v0: 22\n", + "test-fuzzy-match.s1.simple-v0: 2\n" + ] + } + ], + "source": [ + "import os\n", + "import json\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def print_graph():\n", + " directory = 'evals/'\n", + " files = os.listdir(directory)\n", + " \n", + " eval_name_counter = {}\n", + " \n", + " for file in files:\n", + " if file.endswith(\".jsonl\"):\n", + " with open(os.path.join(directory, file), 'r') 
as f:\n", + " jsonl_content = f.read()\n", + " \n", + " # Read the JSONL content into a DataFrame\n", + " data = [json.loads(line) for line in jsonl_content.split('\\n') if line]\n", + " df = pd.DataFrame(data)\n", + "\n", + " if 'spec' not in df.columns:\n", + " continue\n", + "\n", + " # Filter the DataFrame to only include rows with the \"spec\" key\n", + " spec_df = df[df['spec'].notna()].copy()\n", + "\n", + " # Extract the \"eval_name\" from the \"spec\" dictionaries\n", + " spec_df.loc[:, 'eval_name'] = spec_df['spec'].apply(lambda x: x['eval_name'])\n", + "\n", + " for eval_name in spec_df['eval_name']:\n", + " if eval_name not in eval_name_counter:\n", + " eval_name_counter[eval_name] = 0\n", + " eval_name_counter[eval_name] += 1\n", + "\n", + " # Print the counts\n", + " print(\"Counts for each eval_name:\")\n", + " for eval_name, count in eval_name_counter.items():\n", + " print(f\"{eval_name}: {count}\")\n", + "\n", + "print_graph()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.0 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} -- cgit v1.2.3 From b8c7c05dd5f88c02878ad028869bca81f500dd5d Mon Sep 17 00:00:00 2001 From: Douglas Schonholtz <15002691+dschonholtz@users.noreply.github.com> Date: Sat, 22 Apr 2023 19:17:28 -0400 Subject: windows docs make workspace if not there (#25) * windows docs make workspace if not there * small fixes --- README.md | 43 +++++++++++++++++------------------ auto_gpt_benchmarking/AutoGPTAgent.py | 3 +++ 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index fe8bb6b04..c4a891861 100644 --- a/README.md +++ b/README.md @@ -13,54 +13,53 @@ documented. Clone the repo with: - `git clone git@github.com:Significant-Gravitas/Auto-GPT-Benchmarks.git` - `cd Auto-GPT-Benchmarks` + git clone git@github.com:Significant-Gravitas/Auto-GPT-Benchmarks.git + cd Auto-GPT-Benchmarks Create a venv with - `python3.9 -m venv venv` + python3.9 -m venv venv -Activate it with +On MaxOS/Linux Activate it with - `source venv/bin/activate` + source venv/bin/activate + +On Windows: + + venv/scripts/activate Install the requirements with: - `pip install -r requirements.txt` + pip install -r requirements.txt If you haven't already clone the AutoGPT repo somewhere else on your machine. DO NOT CLONE IT INTO A SUBDIR OF THIS REPO. - `cd somewhere/else` - `git clone git@github.com:Significant-Gravitas/Auto-GPT.git` + cd somewhere/else + git clone git@github.com:Significant-Gravitas/Auto-GPT.git + cd Auto-GPT + git checkout stable # Or the branch you want to benchmark -You will need to update the .env file in the Auto-GPT repo to have your OpenAI api key. The file in question is at: +You will need to update the .env file in the Auto-GPT repo to have your OpenAI api key. The file in question is at. 
This should be copied from the .env.template as described in the Auto-GPT README.md.
 
-    `Auto-GPT/.env`
+    Auto-GPT/.env
 
 Finally, we assume you have a docker container built from the Dockerfile in the Auto-GPT repo.
 
 Build this with:
 
-    `cd Auto-GPT`
-    `docker build -t autogpt .`
-
-If you want to run with redis as your memory system, you can stand up a redis image in the AutoGPT repo with
-
-    `docker compose up`
-
-Then you will need to adjust some variables in your .env file to use the redis memory backend.
-See the AutoGPT docs on how to do that.
+    cd Auto-GPT
+    docker build -t autogpt .
 
 Run your first eval with:
 
-    `cd Auto-GPT-Benchmarks`
-    `python3 auto_gpt_benchmarking test-match --auto-gpt-path /your/path/to/Auto-GPT`
+    cd Auto-GPT-Benchmarks
+    python3 auto_gpt_benchmarking test-match --auto-gpt-path /your/path/to/Auto-GPT
 
 You should only need to use the --auto-gpt-path flag the first time you run it. Afterwards, that will be saved in
 
-    `auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml`.
+    auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml.
 
 To see a full list of available flags you can run `python3 -m auto_gpt_benchmarking --help`
 Some of these are inherited from the OpenAI evals framework and do not work quite as intended as they are not applicable
diff --git a/auto_gpt_benchmarking/AutoGPTAgent.py b/auto_gpt_benchmarking/AutoGPTAgent.py
index 63cebf1cb..26d0f4e5b 100644
--- a/auto_gpt_benchmarking/AutoGPTAgent.py
+++ b/auto_gpt_benchmarking/AutoGPTAgent.py
@@ -108,6 +108,9 @@ class AutoGPTAgent:
     def __init__(self, prompt, auto_gpt_path: str):
         self.auto_gpt_path = Path(auto_gpt_path)
         self.auto_workspace = self.auto_gpt_path / "auto_gpt_workspace"
+        # if the workspace doesn't exist, create it
+        if not self.auto_workspace.exists():
+            self.auto_workspace.mkdir()
         self.prompt_file = self.auto_workspace / "prompt.txt"
         self.output_file = self.auto_workspace / "output.txt"
         self.file_logger = self.auto_workspace / "file_logger.txt"
-- 
cgit v1.2.3


From 04722e7fc5a5e24ef70b15e22be4dcff764c5367 Mon Sep 17 00:00:00 2001
From: Douglas Schonholtz <15002691+dschonholtz@users.noreply.github.com>
Date: Wed, 3 May 2023 10:14:44 -0400
Subject: EvalNames with dates for the eval run filename and compatibility with
 0.3.0 (#26)

* EvalNames with dates and the eval run

* Ignore .idea files, update readme to use 3.10, updates for 0.3.0
---
 .gitignore                            |  2 ++
 README.md                             |  2 +-
 auto_gpt_benchmarking/AutoGPTAgent.py | 13 ++++++++-----
 auto_gpt_benchmarking/__main__.py     |  6 ++++--
 4 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/.gitignore b/.gitignore
index e68877ae9..04a0b6b0e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -129,3 +129,5 @@ dmypy.json
 .pyre/
 
 /data
+
+/.idea
diff --git a/README.md b/README.md
index c4a891861..8e0a63c40 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@ Clone the repo with:
 
 Create a venv with
 
-    python3.9 -m venv venv
+    python3.10 -m venv venv
 
 On MaxOS/Linux Activate it with
 
diff --git a/auto_gpt_benchmarking/AutoGPTAgent.py b/auto_gpt_benchmarking/AutoGPTAgent.py
index 26d0f4e5b..7a60009d9 100644
--- a/auto_gpt_benchmarking/AutoGPTAgent.py
+++ b/auto_gpt_benchmarking/AutoGPTAgent.py
@@ -80,15 +80,17 @@ class AutoGPTAgent:
         envs = [
             f"{line.strip()}" for line in open(
                 env_file
-            ) if line.strip() != "" and line.strip()[0] != "#" and line.strip()[0] != "\n"]
+            ) if line.strip() != "" and line.strip()[0] != "#" and line.strip()[0] != "\n" and "=" in line and not line.startswith('SMART_LLM_MODEL')]
+
+        
envs.append("SMART_LLM_MODEL=gpt-3.5-turbo") self.container = client.containers.run( image="autogpt", - command="--continuous -C '/home/appuser/auto_gpt_workspace/ai_settings.yaml'", + command="--continuous -C '/app/auto_gpt_workspace/ai_settings.yaml' --skip-news", environment=envs, volumes={ - self.auto_workspace: {"bind": "/home/appuser/auto_gpt_workspace", "mode": "rw"}, - f"{self.auto_gpt_path}/autogpt": {"bind": "/home/appuser/autogpt", "mode": "rw"}, + self.auto_workspace: {"bind": "/app/auto_gpt_workspace", "mode": "rw"}, + f"{self.auto_gpt_path}/autogpt": {"bind": "/app/autogpt", "mode": "rw"}, }, stdin_open=True, tty=True, @@ -103,11 +105,12 @@ class AutoGPTAgent: """ while True: if self.output_file.exists(): + print("Output file exists") return self.output_file.read_text() def __init__(self, prompt, auto_gpt_path: str): self.auto_gpt_path = Path(auto_gpt_path) - self.auto_workspace = self.auto_gpt_path / "auto_gpt_workspace" + self.auto_workspace = self.auto_gpt_path / "autogpt" / "auto_gpt_workspace" # if the workspace doesn't exist, create it if not self.auto_workspace.exists(): self.auto_workspace.mkdir() diff --git a/auto_gpt_benchmarking/__main__.py b/auto_gpt_benchmarking/__main__.py index c42c73b8e..84761a65d 100644 --- a/auto_gpt_benchmarking/__main__.py +++ b/auto_gpt_benchmarking/__main__.py @@ -56,8 +56,7 @@ def parse_args() -> argparse.Namespace: "--visible", action=argparse.BooleanOptionalAction, default=None) parser.add_argument("--seed", type=int, default=20220722) parser.add_argument("--user", type=str, default="") - parser.add_argument("--record_path", type=str, default=str(Path( - __file__).parent.parent / "data" / f"eval-{datetime.now().strftime('%Y%m%d-%H%M%S')}.jsonl")) + parser.add_argument("--record_path", type=str, default=None) parser.add_argument( "--log_to_file", type=str, default=None, # default=str( # Path(__file__).parent.parent / "data" / "log" / "log.txt" @@ -133,6 +132,9 @@ if __name__ == "__main__": # Obviously, a top level import would be better. This allows us to set the API key with the env file, as it gets # set in the evaluator. We can't set it before the import because the import will fail without an API key. from auto_gpt_benchmarking.Evaluator import Evaluator, OAIRunArgs + if args.record_path is None: + args.record_path = str(Path( + __file__).parent.parent / "data" / f"eval-{args.eval}-{datetime.now().strftime('%Y%m%d-%H%M%S')}.jsonl") run_args = OAIRunArgs( completion_fn=args.completion_fn, eval=args.eval, -- cgit v1.2.3 From dfb73204bf8c278cc4f50155a7a71b14b55d8b3a Mon Sep 17 00:00:00 2001 From: Douglas Schonholtz <15002691+dschonholtz@users.noreply.github.com> Date: Fri, 5 May 2023 16:33:39 -0400 Subject: Update readme to suggest people check out challenges --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 8e0a63c40..e84ff1af8 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,6 @@ +# Closing in favor of Challenges! +Please check out challenges run in our CI pipeline: https://github.com/Significant-Gravitas/Auto-GPT/tree/master/tests/integration/challenges + # Auto-GPT-Benchmarks A set of standardised benchmarks to assess the performance of Auto-GPT. This currently uses the OpenAI Evals framework to run the benchmarks. 
-- cgit v1.2.3 From c6a22abb10c6c2d3d25814a24b269ad250945243 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Sun, 18 Jun 2023 07:30:54 -0700 Subject: Initial commit --- LICENSE | 201 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ README.md | 2 + 2 files changed, 203 insertions(+) create mode 100644 LICENSE create mode 100644 README.md diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. 
We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md new file mode 100644 index 000000000..b42ff76ea --- /dev/null +++ b/README.md @@ -0,0 +1,2 @@ +# Auto-GPT-Factory +Build your own Auto-GPT -- cgit v1.2.3 From 7d4d51ccbbf9912c74353c8088d5c12ffec99823 Mon Sep 17 00:00:00 2001 From: Merwane Hamadi Date: Sun, 18 Jun 2023 07:55:16 -0700 Subject: Setup --- .gitignore | 162 ++++++++++++++++++++++++++++++++++++++ LICENSE | 222 +++++----------------------------------------------- autogpt/__init__.py | 0 tests/__init__.py | 0 4 files changed, 183 insertions(+), 201 deletions(-) create mode 100644 .gitignore create mode 100644 autogpt/__init__.py create mode 100644 tests/__init__.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..29a0285a8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,162 @@ +## Original ignores +autogpt/keys.py +autogpt/*.json +**/auto_gpt_workspace/* +*.mpeg +.env +azure.yaml +ai_settings.yaml +last_run_ai_settings.yaml +.vscode +.idea/* +auto-gpt.json +log.txt +log-ingestion.txt +logs +*.log +*.mp3 +mem.sqlite3 +venvAutoGPT + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +plugins/ +plugins_config.yaml +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ +site/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.direnv/ +.env +.venv +env/ +venv*/ +ENV/ +env.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ +llama-* +vicuna-* + +# mac +.DS_Store + +openai/ + +# news +CURRENT_BULLETIN.md diff --git a/LICENSE b/LICENSE index 261eeb9e9..601935b85 100644 --- a/LICENSE +++ b/LICENSE @@ -1,201 +1,21 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. 
For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. 
The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. +MIT License + +Copyright (c) 2023 Toran Bruce Richards + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/autogpt/__init__.py b/autogpt/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 000000000..e69de29bb -- cgit v1.2.3 From 51f2295971888026275bde4127945df8b182d731 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 18 Jun 2023 11:14:54 -0400 Subject: init agbenchmark --- .gitignore | 37 +++- .gitmodules | 0 .vscode/settings.json | 6 + LICENSE | 2 +- README.md | 191 +++++++++--------- agbenchmark/__init__.py | 0 agbenchmark/benchmark/__init__.py | 0 agbenchmark/benchmark/benchmark.py | 1 + agbenchmark/benchmark/challenges/Challenge.py | 0 agbenchmark/benchmark/challenges/__init__.py | 0 .../benchmark/challenges/adaptability/a1_test.py | 0 .../challenges/basic_abilities/browse_test.py | 0 .../challenges/basic_abilities/read_file_test.py | 0 .../basic_abilities/remember_context_test.py | 0 .../challenges/basic_abilities/write_file_test.py | 0 agbenchmark/benchmark/challenges/code/c1_test.py | 0 agbenchmark/benchmark/challenges/memory/m1_test.py | 0 .../benchmark/challenges/retrieval/r1_test.py | 0 agbenchmark/benchmark/challenges/utils.py | 0 .../challenges/web_navigation/wn1_test.py | 0 .../benchmark/challenges/writing/w1_test.py | 0 agbenchmark/benchmark/run.py | 1 + agbenchmark/server/__init__.py | 0 agbenchmark/server/api.py | 0 agbenchmark/server/utils.py | 0 agbenchmark/workspace/__init__.py | 0 agbenchmark/workspace/cloud_services/aws.py | 0 agbenchmark/workspace/workspace_manager.py | 1 + auto_gpt_benchmarking/AutoGPTAgent.py | 155 --------------- auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml | 6 - auto_gpt_benchmarking/CompletionFn.py | 34 ---- auto_gpt_benchmarking/Evaluator.py | 61 ------ auto_gpt_benchmarking/__init__.py | 0 auto_gpt_benchmarking/__main__.py | 158 --------------- .../completion_fns/auto_gpt_completion_fn.yaml | 4 - evals_analytics.ipynb | 220 --------------------- poetry.lock | 101 ++++++++++ pyproject.toml | 23 +++ requirements.txt | 81 -------- tests/__init__.py | 0 tests/test_api.py | 0 tests/test_benchmark.py | 0 tests/test_workspace_manager.py | 0 43 files changed, 267 insertions(+), 815 deletions(-) delete mode 100644 .gitmodules create mode 100644 .vscode/settings.json create mode 100644 agbenchmark/__init__.py create mode 100644 agbenchmark/benchmark/__init__.py create mode 100644 agbenchmark/benchmark/benchmark.py create mode 100644 agbenchmark/benchmark/challenges/Challenge.py create mode 100644 agbenchmark/benchmark/challenges/__init__.py create mode 100644 agbenchmark/benchmark/challenges/adaptability/a1_test.py create mode 100644 agbenchmark/benchmark/challenges/basic_abilities/browse_test.py create mode 100644 agbenchmark/benchmark/challenges/basic_abilities/read_file_test.py create mode 100644 agbenchmark/benchmark/challenges/basic_abilities/remember_context_test.py create mode 100644 agbenchmark/benchmark/challenges/basic_abilities/write_file_test.py create mode 100644 agbenchmark/benchmark/challenges/code/c1_test.py create mode 100644 agbenchmark/benchmark/challenges/memory/m1_test.py create mode 100644 agbenchmark/benchmark/challenges/retrieval/r1_test.py create mode 100644 agbenchmark/benchmark/challenges/utils.py create mode 100644 agbenchmark/benchmark/challenges/web_navigation/wn1_test.py create mode 100644 agbenchmark/benchmark/challenges/writing/w1_test.py create mode 100644 agbenchmark/benchmark/run.py create mode 100644 agbenchmark/server/__init__.py create mode 100644 agbenchmark/server/api.py create mode 100644 
agbenchmark/server/utils.py create mode 100644 agbenchmark/workspace/__init__.py create mode 100644 agbenchmark/workspace/cloud_services/aws.py create mode 100644 agbenchmark/workspace/workspace_manager.py delete mode 100644 auto_gpt_benchmarking/AutoGPTAgent.py delete mode 100644 auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml delete mode 100644 auto_gpt_benchmarking/CompletionFn.py delete mode 100644 auto_gpt_benchmarking/Evaluator.py delete mode 100644 auto_gpt_benchmarking/__init__.py delete mode 100644 auto_gpt_benchmarking/__main__.py delete mode 100644 auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml delete mode 100644 evals_analytics.ipynb create mode 100644 poetry.lock create mode 100644 pyproject.toml delete mode 100644 requirements.txt create mode 100644 tests/__init__.py create mode 100644 tests/test_api.py create mode 100644 tests/test_benchmark.py create mode 100644 tests/test_workspace_manager.py diff --git a/.gitignore b/.gitignore index 04a0b6b0e..68bc17f9f 100644 --- a/.gitignore +++ b/.gitignore @@ -20,7 +20,6 @@ parts/ sdist/ var/ wheels/ -pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg @@ -50,6 +49,7 @@ coverage.xml *.py,cover .hypothesis/ .pytest_cache/ +cover/ # Translations *.mo @@ -72,6 +72,7 @@ instance/ docs/_build/ # PyBuilder +.pybuilder/ target/ # Jupyter Notebook @@ -82,7 +83,9 @@ profile_default/ ipython_config.py # pyenv -.python-version +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. @@ -91,7 +94,22 @@ ipython_config.py # install all needed dependencies. #Pipfile.lock -# PEP 582; used by e.g. github.com/David-OConnor/pyflow +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff @@ -128,6 +146,15 @@ dmypy.json # Pyre type checker .pyre/ -/data +# pytype static type analyzer +.pytype/ -/.idea +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index e69de29bb..000000000 diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 000000000..3445835be --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,6 @@ +{ + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter" + }, + "python.formatting.provider": "none" +} diff --git a/LICENSE b/LICENSE index 601935b85..696ff02ba 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2023 Toran Bruce Richards +Copyright (c) 2023 Silen Naihin Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index e84ff1af8..820c0f51e 100644 --- a/README.md +++ b/README.md @@ -1,120 +1,131 @@ -# Closing in favor of Challenges! -Please check out challenges run in our CI pipeline: https://github.com/Significant-Gravitas/Auto-GPT/tree/master/tests/integration/challenges +# agbenchmark -# Auto-GPT-Benchmarks -A set of standardised benchmarks to assess the performance of Auto-GPT. -This currently uses the OpenAI Evals framework to run the benchmarks. +A repo built for the purpose of benchmarking the performance of agents far and wide, regardless of how they are set up and how they work -## Setup +#### MVP: function calls api, api returns presigned url, folder is uploaded, write file challenge is measured, score is given -You must add the auto_gpt_benchmarking dir to the python path -Do this with a path file in your venv. OpenAI evals needs to import it. +#### Diagrams: https://whimsical.com/agbenchmark-5n4hXBq1ZGzBwRsK4TVY7x -These instructions currently assume ubuntu 22.04. -They should be fairly adaptable to the Windows/macOS equivalents. Please submit a PR if you would like to see your OS -documented. +## Contributing -Clone the repo with: +- Make sure you have `poetry` installed - `pip install poetry`. +- Then run `poetry install` for dependencies - git clone git@github.com:Significant-Gravitas/Auto-GPT-Benchmarks.git - cd Auto-GPT-Benchmarks +- To add a requirement: `poetry add requirement`. +- To run in the venv: `poetry run python script.py` -Create a venv with +Feel free to merge with `main` at will (but also feel free to ask for review) - if you can't, send a message in the R&D chat for access. - python3.10 -m venv venv +If you push at any point and break things - it'll happen to everyone - fix it asap. Step 1 is to revert `main` to the last working commit. +Let people know what your beautiful code does, and document everything well. -On macOS/Linux, activate it with +Share your progress :) - source venv/bin/activate +## Api -On Windows: +FastAPI with REST, import requests + +``` +POST hostname:8080/challenges +{ + "test_name": "" + "challenge": "memory" - optional +} +``` - venv/scripts/activate +## Auth -Install the requirements with: +Get a preSignedUrl from the API - pip install -r requirements.txt +``` +POST preSignedUrl +{ + "artifacts": [{}] +} +``` -If you haven't already, clone the AutoGPT repo somewhere else on your machine. -DO NOT CLONE IT INTO A SUBDIR OF THIS REPO. +## Workspace - cd somewhere/else - git clone git@github.com:Significant-Gravitas/Auto-GPT.git - cd Auto-GPT - git checkout stable # Or the branch you want to benchmark +Kubernetes with AWS S3 or GCP -You will need to update the .env file in the Auto-GPT repo to have your OpenAI api key. The file in question is at.
This should be copied from the .env.template as described in the Auto-GPT README.md +## Challenges - Auto-GPT/.env +#### Dataset -Finally, we assume you have a docker container built from the Dockerfile in the Auto-GPT repo. +Manually created, existing challenges within Auto-GPT, https://osu-nlp-group.github.io/Mind2Web/ -Build this with: +#### Simple challenge creation through a DSL (domain-specific language) - cd Auto-GPT - docker build -t autogpt . +``` +Challenge TicTacToeCoding + Description "The agent should implement a basic tic-tac-toe game in Python." + Artifacts { + Code "tictactoe.py" + } + Tasks { + Code "Write a function to initialize the game board." + Code "Write a function to handle a player's turn." + Code "Write a function to check for a winning move." + Test "Write tests for board initialization, turn handling, and win detection." + Command "Run the test suite to ensure everything is working as expected." + } + SuccessCriteria { + Correctness "The game should correctly alternate between two players." + Correctness "The game should correctly identify a winning move." + Efficiency "The game should not use unnecessary computational resources." + Design "The solution should follow good practices for clean, idiomatic Python." + } +EndChallenge +``` -Run your first eval with: +#### Validators - cd Auto-GPT-Benchmarks - python3 auto_gpt_benchmarking test-match --auto-gpt-path /your/path/to/Auto-GPT +Designed to handle specific types of output (e.g., text, code, structured data) -You should only need to use the --auto-gpt-path flag the first time you run it. Afterwards, that will be saved in +#### Logging - auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml. +Log different requests coming in - write file, change file, etc. Maybe a db in the future for metrics, logs, etc. -To see a full list of available flags you can run `python3 -m auto_gpt_benchmarking --help` -Some of these are inherited from the OpenAI evals framework and do not work quite as intended as they are not applicable -to this use case. +#### Written Challenges -This saves a file in `Auto-GPT-Benchmarks/data/records.jsonl` -This file is currently a default that is configurable with the --record_path flag. You will have to specify the fully -qualified path. +For code and writing challenges, we can create a reference text and use metrics like METEOR, BERTScore, and BARTScore. -## Currently Supported Benchmarks: -From OpenAI Evals -- [x] test-match -- [x] test-fuzzy-match -- [ ] Everything else they have... +## Repo -## Understanding OpenAI Evals - -The Evals docs are here and very good: https://github.com/openai/evals/tree/main/docs - -The basic idea is this though: -1. Use a completion function to point to the language model or in our case AutoGPT, the model you want to test. -2. Register that completion function with the evals framework with a yaml in a `completion_fns` dir. -3. Run the evals against the completion function. - -Then you can also make more yaml-defined evals and run them against the completion function as needed. - -### Completions Functions - -See our yaml file in `completion_fns` dir for the registration of the completion function.
-See our completion function itself in CompletionFn.py -That points to the AutoGPT model we want to test which is spun up dynamically in a docker container in AutoGPTAgent.py - - -# Example final output: - -/Auto-GPT-Benchmarks-fork$ cat /tmp/evallogs/230417220821DPM75QNS_auto_gpt_completion_fn_test-match.jsonl -{"spec": {"completion_fns": ["auto_gpt_completion_fn"], "eval_name": "test-match.s1.simple-v0", "base_eval": "test-match", "split": "s1", "run_config": {"completion_fns": ["auto_gpt_completion_fn"], "eval_spec": {"cls": "evals.elsuite.basic.match:Match", "args": {"samples_jsonl": "test_match/samples.jsonl"}, "key": "test-match.s1.simple-v0", "group": "test-basic"}, "seed": 20220722, "max_samples": null, "command": "/home/douglas/AGI/Auto-GPT-Benchmarks-fork/venv/bin/oaieval auto_gpt_completion_fn test-match --registry_path /home/douglas/AGI/Auto-GPT-Benchmarks-fork/auto_gpt_benchmarking", "initial_settings": {"visible": true}}, "created_by": "", "run_id": "230417220821DPM75QNS", "created_at": "2023-04-17 22:08:21.904498"}} -{"final_report": {"accuracy": 0.3333333333333333}} -{"run_id": "230417220821DPM75QNS", "event_id": 0, "sample_id": "test-match.s1.2", "type": "sampling", "data": {"prompt": "Complete the phrase as concisely as possible.\nUser: OpenAI was founded in 20\nAssistant: ", "sampled": "OpenAI was founded in 2015.2015"}, "created_by": "", "created_at": "2023-04-17 22:10:13.127375+00:00"} -{"run_id": "230417220821DPM75QNS", "event_id": 1, "sample_id": "test-match.s1.2", "type": "match", "data": {"correct": false, "expected": "15", "picked": null, "sampled": "OpenAI was founded in 2015.2015", "options": ["15"]}, "created_by": "", "created_at": "2023-04-17 22:10:13.127550+00:00"} -{"run_id": "230417220821DPM75QNS", "event_id": 2, "sample_id": "test-match.s1.1", "type": "sampling", "data": {"prompt": "Complete the phrase as concisely as possible.\nUser: The first US president was \nAssistant: ", "sampled": "George Washington"}, "created_by": "", "created_at": "2023-04-17 22:11:17.761693+00:00"} -{"run_id": "230417220821DPM75QNS", "event_id": 3, "sample_id": "test-match.s1.1", "type": "match", "data": {"correct": true, "expected": "George Washington", "picked": "George Washington", "sampled": "George Washington", "options": ["George Washington"]}, "created_by": "", "created_at": "2023-04-17 22:11:17.761739+00:00"} -{"run_id": "230417220821DPM75QNS", "event_id": 4, "sample_id": "test-match.s1.0", "type": "sampling", "data": {"prompt": "Complete the phrase as concisely as possible.\nUser: Once upon a \nAssistant: ", "sampled": "Once upon a time"}, "created_by": "", "created_at": "2023-04-17 22:12:04.691026+00:00"} -{"run_id": "230417220821DPM75QNS", "event_id": 5, "sample_id": "test-match.s1.0", "type": "match", "data": {"correct": false, "expected": "time", "picked": null, "sampled": "Once upon a time", "options": ["time"]}, "created_by": "", "created_at": "2023-04-17 22:12:04.691064+00:00"} -(venv) douglas@douglas-XPS-15-9500:~/AGI/Auto-GPT-Benchmarks-fork$ - -# What is next? - -- [ ] Run the rest of the OpenAI Evals Especially the modelgraded ones -- [ ] Build longer form tasks, (code fix backed by testing) -- [ ] Explicitly note the common failure modes in the test harness and fix them. 
Most of these appear to be failure modes with the core AutoGPT project -- [ ] Get token counting data from the model. Add scores to result files based on pricing associated with the tokens and models used -- [ ] Think about how this can be applied to other projects besides AutoGPT so we can be THE agent evaluation framework. -- [ ] Figure out how the OpenAI Evals results are saved... -- [ ] Support multi-threaded evals. OpenAI has great support for this. The docker system built here doesn't. +``` +|-- agbenchmark/ **main project directory** +| |-- __init__.py +| |-- server/ +| | |-- __init__.py +| | |-- api.py **opens server on host and exposes urls** +| | |-- utils.py +| |-- benchmark/ +| | |-- __init__.py +| | |-- benchmark.py **combining scores, metrics, final evaluation** +| | |-- run.py **entry point. sets everything up** +| | |-- challenges/ **challenges across different metrics** +| | | |-- __init__.py +| | | |-- Challenge.py **easy challenge creation through Challenge class. potentially how DSL is defined. may need to inherit challenge class like Adaptability(Challenge)** +| | | |-- utils.py +| | | |-- adaptability.py +| | | |-- basic_abilities.py +| | | |-- code.py +| | | |-- memory.py +| | | |-- retrieval.py +| | | |-- web_navigation.py +| | | |-- writing.py +| |-- workspace/ **workspace related func** +| | |-- __init__.py +| | |-- workspace_manager.py **creation, deletion, preSignedUrl generation** +| | |-- cloud_services/ +| | | |-- __init__.py +| | | |-- aws.py **not finalized, but write, read, and del files** +|-- tests/ **test func of agbenchmark** +| |-- __init__.py +| |-- test_api.py +| |-- test_benchmark.py +| |-- test_workspace_manager.py +``` + +Later: GitHub Actions integration, OpenAPI?, good versioning and backward compatibility diff --git a/agbenchmark/__init__.py b/agbenchmark/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/benchmark/__init__.py b/agbenchmark/benchmark/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/benchmark/benchmark.py b/agbenchmark/benchmark/benchmark.py new file mode 100644 index 000000000..2f8124272 --- /dev/null +++ b/agbenchmark/benchmark/benchmark.py @@ -0,0 +1 @@ +# how well the agent did on the challenges, the metrics calculation diff --git a/agbenchmark/benchmark/challenges/Challenge.py b/agbenchmark/benchmark/challenges/Challenge.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/benchmark/challenges/__init__.py b/agbenchmark/benchmark/challenges/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/benchmark/challenges/adaptability/a1_test.py b/agbenchmark/benchmark/challenges/adaptability/a1_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/benchmark/challenges/basic_abilities/browse_test.py b/agbenchmark/benchmark/challenges/basic_abilities/browse_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/benchmark/challenges/basic_abilities/read_file_test.py b/agbenchmark/benchmark/challenges/basic_abilities/read_file_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/benchmark/challenges/basic_abilities/remember_context_test.py b/agbenchmark/benchmark/challenges/basic_abilities/remember_context_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/benchmark/challenges/basic_abilities/write_file_test.py b/agbenchmark/benchmark/challenges/basic_abilities/write_file_test.py new file mode 100644 index
000000000..e69de29bb diff --git a/agbenchmark/benchmark/challenges/code/c1_test.py b/agbenchmark/benchmark/challenges/code/c1_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/benchmark/challenges/memory/m1_test.py b/agbenchmark/benchmark/challenges/memory/m1_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/benchmark/challenges/retrieval/r1_test.py b/agbenchmark/benchmark/challenges/retrieval/r1_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/benchmark/challenges/utils.py b/agbenchmark/benchmark/challenges/utils.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/benchmark/challenges/web_navigation/wn1_test.py b/agbenchmark/benchmark/challenges/web_navigation/wn1_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/benchmark/challenges/writing/w1_test.py b/agbenchmark/benchmark/challenges/writing/w1_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/benchmark/run.py b/agbenchmark/benchmark/run.py new file mode 100644 index 000000000..b07ac6b55 --- /dev/null +++ b/agbenchmark/benchmark/run.py @@ -0,0 +1 @@ +# running all of the different challenges diff --git a/agbenchmark/server/__init__.py b/agbenchmark/server/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/server/api.py b/agbenchmark/server/api.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/server/utils.py b/agbenchmark/server/utils.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/workspace/__init__.py b/agbenchmark/workspace/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/workspace/cloud_services/aws.py b/agbenchmark/workspace/cloud_services/aws.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/workspace/workspace_manager.py b/agbenchmark/workspace/workspace_manager.py new file mode 100644 index 000000000..cfcf3f7ac --- /dev/null +++ b/agbenchmark/workspace/workspace_manager.py @@ -0,0 +1 @@ +# Manages the workspaces including creation, deletion, etc diff --git a/auto_gpt_benchmarking/AutoGPTAgent.py b/auto_gpt_benchmarking/AutoGPTAgent.py deleted file mode 100644 index 7a60009d9..000000000 --- a/auto_gpt_benchmarking/AutoGPTAgent.py +++ /dev/null @@ -1,155 +0,0 @@ -""" -This instantiates an AutoGPT agent that is capable of handling any task. -It is designed to pass benchmarks as effectively as possible. - -Loads in the ai_settings.yaml file to get the AI's name, role, and goals. -Sets the AI to continuous mode, but kills it if it takes more than 50,000 tokens on any particular evaluation. - -The model is instantiated with a prompt from the AutoGPT completion function. - -Eventually we will also save and log all of the associated output and thinking for the model as well -""" -from pathlib import Path -import docker -import asyncio -import aiodocker - - -class AutoGPTAgent: - """ - A class object that contains the configuration information for the AI - The init function takes an evaluation prompt. - It copies the ai_settings.yaml file in AutoGPTData to the Auto-GPT repo. - It then copies the given prompt into a text file called prompt.txt in Auto-GPT/auto_gpt_workspace - It then polls the model's token usage and watches for a file called output.txt in the Auto-GPT/auto_gpt_workspace folder. - If the model has used more than 50,000 tokens, it kills the model.
- If the model has used less than 50,000 tokens, it returns the output.txt file. - """ - def _clean_up_workspace(self): - """ - Cleans up the workspace by deleting the prompt.txt and output.txt files. - :return: - """ - # check if the files are there and delete them if they are - if self.prompt_file.exists(): - self.prompt_file.unlink() - if self.output_file.exists(): - self.output_file.unlink() - if self.file_logger.exists(): - self.file_logger.unlink() - - def _copy_ai_settings(self) -> None: - self.ai_settings_dest.write_text(self.ai_settings_file.read_text()) - - def _copy_prompt(self) -> None: - self.prompt_file.write_text(self.prompt) - - async def _stream_logs(self, container: aiodocker.containers.DockerContainer) -> None: - try: - async for line in container.log(stdout=True, stderr=True, follow=True, tail="all"): - print(line.strip()) - await asyncio.sleep(1) - except aiodocker.exceptions.DockerError as e: - # Handle Docker errors (e.g., container is killed or removed) - print('Docker error: {}'.format(e)) - - async def _run_stream_logs(self) -> None: - """ - This grabs the docker containers id and streams the logs to the console with aiodocker. - :return: None - """ - async with aiodocker.Docker() as docker_client: - try: - container = docker_client.containers.container(self.container.id) - await self._stream_logs(container) - except aiodocker.exceptions.DockerError as e: - # Handle cases when the container is not found - print('Container not found: {}'.format(e)) - - def _start_agent(self): - """ - This starts the agent in the docker container. - This assumes you have the docker image built with: - docker build -t autogpt . - In the dockerfile in the Auto-GPT repo. - You also must set up the .env file in the Auto-GPT repo. - :return: - """ - client = docker.from_env() - env_file = self.auto_gpt_path / ".env" - envs = [ - f"{line.strip()}" for line in open( - env_file - ) if line.strip() != "" and line.strip()[0] != "#" and line.strip()[0] != "\n" and "=" in line and not line.startswith('SMART_LLM_MODEL')] - - envs.append("SMART_LLM_MODEL=gpt-3.5-turbo") - - self.container = client.containers.run( - image="autogpt", - command="--continuous -C '/app/auto_gpt_workspace/ai_settings.yaml' --skip-news", - environment=envs, - volumes={ - self.auto_workspace: {"bind": "/app/auto_gpt_workspace", "mode": "rw"}, - f"{self.auto_gpt_path}/autogpt": {"bind": "/app/autogpt", "mode": "rw"}, - }, - stdin_open=True, - tty=True, - detach=True - ) - asyncio.run(self._run_stream_logs()) - - def _poll_for_output(self): - """ - This polls the output file to see if the model has finished. 
- :return: - """ - while True: - if self.output_file.exists(): - print("Output file exists") - return self.output_file.read_text() - - def __init__(self, prompt, auto_gpt_path: str): - self.auto_gpt_path = Path(auto_gpt_path) - self.auto_workspace = self.auto_gpt_path / "autogpt" / "auto_gpt_workspace" - # if the workspace doesn't exist, create it - if not self.auto_workspace.exists(): - self.auto_workspace.mkdir() - self.prompt_file = self.auto_workspace / "prompt.txt" - self.output_file = self.auto_workspace / "output.txt" - self.file_logger = self.auto_workspace / "file_logger.txt" - self.ai_settings_file = Path(__file__).parent / "AutoGPTData" / "ai_settings.yaml" - self.ai_settings_dest = self.auto_workspace / "ai_settings.yaml" - self.prompt = prompt - self._clean_up_workspace() - self._copy_ai_settings() - self._copy_prompt() - self.container = None - self.killing = False - self.logging_task = None - - def start(self): - self._start_agent() - answer = self._poll_for_output() - print(f"Prompt was: {self.prompt}, Answer was: {answer}") - self.kill() - return answer - - def kill(self): - if self.killing: - return - self.killing = True - self._clean_up_workspace() - if self.container: - # kill the container - try: - self.container.kill() - self.container.remove() - except docker.errors.APIError: - print('Couldn\'t find container to kill. Assuming container successfully killed itself.') - if self.logging_task: - self.logging_task.cancel() - self.killing = False - - - - diff --git a/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml b/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml deleted file mode 100644 index ab6caaed0..000000000 --- a/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml +++ /dev/null @@ -1,6 +0,0 @@ -ai_goals: -- Evaluate the prompt in `prompt.txt` and find the best answer in the format provided. -- Get the correct answer to the question in the fewest number of steps possible. You are scored first on if you get the correct answer, and second on how many tokens you take to get the right answer so keep your thinking and tool usage as minimal as possible while still ensuring you get the correct answer. -- Save the final answer and output to the `output.txt` file, the only file you should write to, then immediately exit the program because you are done. 
-ai_name: EvaluationAgent -ai_role: an ai that is tested on how effectively it can efficiently evaluate questions and answer them correctly while using as few resources as possible diff --git a/auto_gpt_benchmarking/CompletionFn.py b/auto_gpt_benchmarking/CompletionFn.py deleted file mode 100644 index f82ede85c..000000000 --- a/auto_gpt_benchmarking/CompletionFn.py +++ /dev/null @@ -1,34 +0,0 @@ -from evals.api import CompletionFn, CompletionResult - -from evals.prompt.base import CompletionPrompt -from evals.record import record_sampling -from auto_gpt_benchmarking.AutoGPTAgent import AutoGPTAgent - - -class AutoGPTCompletionResult(CompletionResult): - def __init__(self, response) -> None: - self.response = response - - def get_completions(self) -> list[str]: - return [self.response.strip()] - - -class AutoGPTCompletionFn(CompletionFn): - - def __init__(self, auto_gpt_path, **kwargs) -> None: - self.auto_gpt_path = auto_gpt_path - self.agent = None - - def __call__(self, prompt, **kwargs) -> AutoGPTCompletionResult: - prompt = CompletionPrompt(prompt).to_formatted_prompt() - self.kill_agent() - self.agent = AutoGPTAgent(prompt, self.auto_gpt_path) - response = self.agent.start() - record_sampling(prompt=prompt, sampled=response) - return AutoGPTCompletionResult(response) - - def kill_agent(self): - if self.agent: - self.agent.kill() - - diff --git a/auto_gpt_benchmarking/Evaluator.py b/auto_gpt_benchmarking/Evaluator.py deleted file mode 100644 index 4301fb3bc..000000000 --- a/auto_gpt_benchmarking/Evaluator.py +++ /dev/null @@ -1,61 +0,0 @@ -""" -The evaluator class actually executes the evals. -""" -from evals.cli import oaieval -from evals.registry import Registry -from pathlib import Path -from typing import List, Optional, Tuple -import sys - - -class OAIRunArgs: - def __init__( - self, - completion_fn: str, - eval: str, - extra_eval_params: str = "", - max_samples: int = None, - cache: bool = True, - visible: bool = None, - seed: int = 20220722, - user: str = "", - record_path: str = None, - log_to_file: str = None, - debug: bool = False, - local_run: bool = True, - dry_run: bool = False, - dry_run_logging: bool = True, - ): - self.completion_fn = completion_fn - self.eval = eval - self.extra_eval_params = extra_eval_params - self.max_samples = max_samples - self.cache = cache - self.visible = visible - self.seed = seed - self.user = user - self.record_path = record_path - self.log_to_file = log_to_file - self.debug = debug - self.local_run = local_run - self.dry_run = dry_run - self.dry_run_logging = dry_run_logging - # create the record and logging paths if they don't exist - Path(self.record_path).parent.mkdir(parents=True, exist_ok=True) - # Path(self.log_to_file).parent.mkdir(parents=True, exist_ok=True) - # Registry path should be the auto_gpt_benchmarking folder - self.registry_path = None - - -class Evaluator: - def __init__(self, oai_run_args: OAIRunArgs): - self.oai_run_args = oai_run_args - registry_path = Path(__file__).parent - - # add registry path to the python system path - sys.path.append(str(registry_path)) - self.oai_run_args.registry_path = [registry_path] - # self.registry = Registry([registry_path]) - - def run(self): - oaieval.run(self.oai_run_args) diff --git a/auto_gpt_benchmarking/__init__.py b/auto_gpt_benchmarking/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/auto_gpt_benchmarking/__main__.py b/auto_gpt_benchmarking/__main__.py deleted file mode 100644 index 84761a65d..000000000 --- a/auto_gpt_benchmarking/__main__.py +++ 
/dev/null @@ -1,158 +0,0 @@ -""" -This is the main evaluation file. In it you can specify the following: - -1. The number of threads to use for evaluation. This is set to 1 by default.And will remain that way until we can spin - up containers on command -2. The timeout for each thread. This is set to 60 seconds by default. This is the amount of time each thread will run - for before it is killed when evaluating an agent -3. The path to the AutoGPT code. This is a required parameter as we do not know where your code lives. -4. The evals you would like to run. The options here are any OpenAI eval, or any of the evals defined in this repository - - -What this file does is it parses the params given and then runs the evals with OpenAI's evals framework. -""" - -import argparse -import os -import sys -from pathlib import Path -from datetime import datetime -import yaml -from datetime import datetime - - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser() - parser.add_argument( - "eval", type=str, help="Name of an eval. See registry.") - parser.add_argument( - "--completion-fn", - type=str, - dest="completion_fn", - default="auto_gpt_completion_fn", - help="One or more CompletionFn URLs, separated by commas (,). " - "A CompletionFn can either be the name of a model available in the OpenAI API or a key in the registry " - "(see evals/registry/completion_fns).", - ) - parser.add_argument( - "--timeout", - type=int, - default=300, - help="The timeout for each thread", - ) - parser.add_argument( - "--auto-gpt-path", - type=str, - default=None, - help="The path to the AutoGPT code. This updates auto_gpt_competion_fn.yaml in completion fns. " - "So you only need to set this once.", - ) - parser.add_argument("--extra_eval_params", type=str, default="") - parser.add_argument("--max_samples", type=int, default=None) - parser.add_argument( - "--cache", action=argparse.BooleanOptionalAction, default=True) - parser.add_argument( - "--visible", action=argparse.BooleanOptionalAction, default=None) - parser.add_argument("--seed", type=int, default=20220722) - parser.add_argument("--user", type=str, default="") - parser.add_argument("--record_path", type=str, default=None) - parser.add_argument( - "--log_to_file", type=str, default=None, # default=str( - # Path(__file__).parent.parent / "data" / "log" / "log.txt" - # ), help="Log to a file instead of stdout" - ) - parser.add_argument( - "--debug", action=argparse.BooleanOptionalAction, default=False) - parser.add_argument( - "--local-run", action=argparse.BooleanOptionalAction, default=True) - parser.add_argument( - "--dry-run", action=argparse.BooleanOptionalAction, default=False) - parser.add_argument("--dry-run-logging", - action=argparse.BooleanOptionalAction, default=True) - return parser.parse_args() - - -def update_yaml_with_auto_gpt_path(yaml_path: str, auto_gpt_path: str or None) -> Path: - """ - If there is a given auto_gpt_path, then we need to update the yaml file to include it in the system path - If we don't have one. Then we get the path from the yaml. - If none exists in the yaml and we don't have a path then we raise an exception. 
- :param yaml_path: The path to the yaml file - :param auto_gpt_path: The path to the AutoGPT code - :return: The path to the AutoGPT code - """ - with open(yaml_path, "r") as f: - yaml_data = yaml.safe_load(f) - if yaml_data["auto_gpt_completion_fn"]["args"]["auto_gpt_path"] is None and auto_gpt_path is None: - raise Exception( - "You must specify a auto_gpt_path in the yaml file or pass it in as a parameter") - if auto_gpt_path is None: - auto_gpt_path = yaml_data["auto_gpt_completion_fn"]["args"]["auto_gpt_path"] - if auto_gpt_path is not None: - yaml_data["auto_gpt_completion_fn"]["args"]["auto_gpt_path"] = auto_gpt_path - with open(yaml_path, "w") as f: - yaml.safe_dump(yaml_data, f) - - return Path(auto_gpt_path).absolute() - - -def load_env_file(env_path: Path): - if not env_path.exists(): - raise FileNotFoundError('You must set the OpenAI key in the AutoGPT env file. ' - 'We need your api keys to start the AutoGPT agent and use OpenAI evals') - with open(env_path, "r") as f: - # find the OPENAI_API_KEY key split it from the equals sign and assign it so OpenAI evals can use it. - for line in f.readlines(): - if line.startswith("OPENAI_API_KEY"): - os.environ["OPENAI_API_KEY"] = line.split("=")[1].strip() - break - - -if __name__ == "__main__": - args = parse_args() - # do not run in multiprocessing mode We do not use this right now, as it disables OpenAI's timeouts :( - # os.environ["EVALS_SEQUENTIAL"] = "1" - os.environ["EVALS_THREAD_TIMEOUT"] = str(args.timeout) - os.environ["EVALS_THREADS"] = str(1) - - # Update the yaml file with the auto_gpt_path - autogpt_path = update_yaml_with_auto_gpt_path( - str(Path(__file__).parent / "completion_fns" / - "auto_gpt_completion_fn.yaml"), - args.auto_gpt_path - ) - - # Add the benchmarks path to the system path so we can import auto_gpt_benchmarking - sys.path.append(str(Path(__file__).parent.parent.absolute())) - - # load all of the environment variables in the auto-gpt path/.env file - load_env_file(Path(autogpt_path) / ".env") - - # Obviously, a top level import would be better. This allows us to set the API key with the env file, as it gets - # set in the evaluator. We can't set it before the import because the import will fail without an API key. 
- from auto_gpt_benchmarking.Evaluator import Evaluator, OAIRunArgs - if args.record_path is None: - args.record_path = str(Path( - __file__).parent.parent / "data" / f"eval-{args.eval}-{datetime.now().strftime('%Y%m%d-%H%M%S')}.jsonl") - run_args = OAIRunArgs( - completion_fn=args.completion_fn, - eval=args.eval, - extra_eval_params=args.extra_eval_params, - max_samples=args.max_samples, - cache=args.cache, - visible=args.visible, - seed=args.seed, - user=args.user, - record_path=args.record_path, - log_to_file=args.log_to_file, - debug=args.debug, - local_run=args.local_run, - dry_run=args.dry_run, - dry_run_logging=args.dry_run_logging) - - # Run the evals - evaluator = Evaluator( - run_args - ) - evaluator.run() diff --git a/auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml b/auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml deleted file mode 100644 index a101f000a..000000000 --- a/auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml +++ /dev/null @@ -1,4 +0,0 @@ -auto_gpt_completion_fn: - args: - auto_gpt_path: - class: auto_gpt_benchmarking.CompletionFn:AutoGPTCompletionFn diff --git a/evals_analytics.ipynb b/evals_analytics.ipynb deleted file mode 100644 index f1b48424c..000000000 --- a/evals_analytics.ipynb +++ /dev/null @@ -1,220 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAEWCAYAAAB8LwAVAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAc5klEQVR4nO3deZgddZ3v8feHALLKYoLGrCiIoBDABlRQQBEiKsHHLVEUvGDmOuK43RnBmQcYnHH0ehVHwYHoRERlEREmwwQRRhEVkTSILEEkRiAJYBrCpiAY+Nw/6tfDSVOdPunu6pNOf17Pc56u+v1q+Vaf5Hy6llMl20RERPS1UacLiIiI9VMCIiIiaiUgIiKiVgIiIiJqJSAiIqJWAiIiImolICI6TNJrJN3e6Toi+kpAREdJukrSg5Ke0+lamiDpIEnLa9qvknQcgO2f2t6ljWWdIunbTdQZUScBER0jaTrwGsDAESO87o1Hcn3ru/w+ok4CIjrpfcC1wNnA0a0dkqZI+r6kHkkPSDq9pe8Dkm6T9KikxZL2Lu2WtFPLdGdL+qcyfJCk5ZI+Kek+4BuStpN0aVnHg2V4csv820v6hqR7Sv8lpf0WSW9pmW4TSfdL2mswv4S+exmlxhVl+26X9HpJM4FPAe+S9EdJvy7TvlDSAkmrJC2R9IGW5Wwu6Zul9tsk/V2f9dxZ1nUT8CdJG0s6QdLvWn63b22Z/hhJP5d0mqSHJC2V9OrSvkzSSklrvI8xuiUgopPeB3ynvA6T9HwASeOAS4G7gOnAJOD80vcO4JQy73Op9jweaHN9LwC2B6YBc6n+/X+jjE8FHgdOb5n+W8AWwMuAHYDTSvs5wFEt0x0O3Gv7V23W0S9JuwDHA/vY3ho4DLjT9g+AzwAX2N7K9owyy/nAcuCFwNuBz0h6Xek7mer39yLgDX1q7jUHeBOwre3VwO+o9uq2Af4R+LakiS3T7wfcBDwPOLesfx9gp7L80yVtNdTfQ6wnbOeV14i/gAOAvwDjy/hvgI+V4VcBPcDGNfNdDnykn2Ua2Kll/Gzgn8rwQcCTwGZrqWlP4MEyPBF4GtiuZroXAo8Czy3j3wP+rp9lHlSW81Cf12rguJZplpfhnYCVwCHAJn2WdQrw7ZbxKcBTwNYtbf8CnF2GlwKHtfQd17ueMn4n8L8GeJ9uBGaV4WOAO1r6di+/8+e3tD0A7Nnpf195Dc8rexDRKUcDP7R9fxk/l2cOM00B7nL1F21fU6j+yh2MHtt/7h2RtIWksyTdJekR4Gpg27IHMwVYZfvBvguxfQ/wc+BtkrYF3ki1F9Sfe2xv2/oCflY3oe0lwEepwmClpPMlvbCf5b6w1PhoS9tdVHtcvf3LWvpah2vbJL1P0o3lENJDwMuB8S2T/KFl+PFSc9+27EFsIBIQMeIkbQ68EzhQ0n3lnMDHgBmSZlB9aE3t58TpMuDF/Sz6MapDQr1e0Ke/762LPwHsAuxn+7nAa3tLLOvZvgRAnW9SHVJ5B/AL2yv6mW6d2T7X9gFUh74MfK6f+u8pNW7d0jYV6K3lXmByS9+UutX1DkiaBnyN6hDX80qQ3UL1+4gxKAERnXAk1aGR3agO6+wJ7Ar8lOrcwnVUH26flbSlpM0k7V/m/TrwfyS9QpWdygcbVIdD3i1pXDmpe+AAdWxN9RfvQ5K2pzpmD4Dte4HLgK+Wk9mbSHpty7yXAHsDH6E6JzEsJO0i6XWqLvv9c6nv6dL9B2C6pI1KjcuAa4B/Kb+jPYBjgd5LYb8LnFjqn0T1wb82W1IFRk+p5f1UexAxRiUgohOOBr5h+27b9/W+qE4Qv4fqL9a3UB2Pv5vqJOy7AGxfCPwz1SGpR6k+qLcvy/1Ime+hspxLBqjjS8DmwP1UV1P9oE//e6nOk/yG6rzAR3s7bD8OXATsCHy/7S0f2HOAz5aa7qM6OX5i6buw/HxA0g1leA7Vieh7gIuBk21fWfpOpfrd/R64kupcyRP9rdj2YuALwC+owmh3qkNpMUbJzgODIgZD0knAS2zXXR203pH0QWC27YH2rCKA7EFEDEo5JHUsMK/TtfRH0kRJ+0vaqFw++wmqvYyItiQgItZR+TLaMuAy21d3u
p612BQ4i+pQ3I+A/wC+2tGKYlTJIaaIiKiVPYiIiKi1Qd2ga/z48Z4+fXqny4iIGDWuv/76+21PqOvboAJi+vTpdHd3d7qMiIhRQ9Jd/fXlEFNERNRKQERERK0ERERE1EpARERErQRERETUSkBEREStxgJC1TOFf1yea3urpI/UTCNJXy7P0r1J5dnCpe9oSXeUV55zGxExwpr8HsRq4BO2bygPNLle0hXllsK93gjsXF77Af8G7Ndyb/4uqvvTXy9pQd3TvSIiohmN7UHYvtf2DWX4UeA2nnkUYq9ZwDmuXEv1uMeJVA9qv8J27yMfrwBmNlVrREQ824icg5A0HdgL+GWfrkms+Uzc5aWtv/a6Zc+V1C2pu6enZ9hqjohYn02cPBVJSGLi5KmNrKPxgJC0FdWTtz5q+5HhXr7teba7bHdNmFB7O5GIiA3OfSuWMe2TlzLtk5dy34plA88wCI0GhKRNqMLhO7brHsu4gjUfpD65tPXXHhERI6TJq5gE/Dtwm+0v9jPZAuB95WqmVwIPl4fFXw4cWh62vh1waGmLiIgR0uRVTPtTPfT9Zkk3lrZPAVMBbJ8JLAQOB5YAjwHvL32rJH0aWFTmO9X2qgZrjYiIPhoLCNs/AzTANAY+1E/ffGB+A6VFREQb8k3qiIiolYCIiIhaCYiIiKiVgIiIiFoJiIiIqJWAiIiIWgmIiIiolYCIiIhaCYiIiKiVgIiIiFoJiIiIqJWAiIiIWgmIiIiolYCIiIhaCYiIiKiVgIiIiFqNPTBI0nzgzcBK2y+v6f9b4D0tdewKTChPk7sTeBR4Clhtu6upOiMiol6TexBnAzP767T9edt72t4TOBH4SZ/Hih5c+hMOEREd0FhA2L4aaPc50nOA85qqJSIi1l3Hz0FI2oJqT+OilmYDP5R0vaS5naksImJsa+wcxDp4C/DzPoeXDrC9QtIOwBWSflP2SJ6lBMhcgKlTpzZfbUTEGNHxPQhgNn0OL9leUX6uBC4G9u1vZtvzbHfZ7powYUKjhUZEjCUdDQhJ2wAHAv/R0ralpK17h4FDgVs6U2FExNjV5GWu5wEHAeMlLQdOBjYBsH1mmeytwA9t/6ll1ucDF0vqre9c2z9oqs6IiKjXWEDYntPGNGdTXQ7b2rYUmNFMVRER0a714RxERESshxIQERFRKwERERG1EhAREVErAREREbUSEBERUSsBERERtRIQERFRKwERERG1EhAREVErAREREbUSEBERUSsBERERtRIQERFRKwERERG1EhAREVErAREREbUaCwhJ8yWtlFT7PGlJB0l6WNKN5XVSS99MSbdLWiLphKZqjIiI/jW5B3E2MHOAaX5qe8/yOhVA0jjgDOCNwG7AHEm7NVhnRETUaCwgbF8NrBrErPsCS2wvtf0kcD4wa1iLi4iIAXX6HMSrJP1a0mWSXlbaJgHLWqZZXtpqSZorqVtSd09PT5O1RkSMKZ0MiBuAabZnAF8BLhnMQmzPs91lu2vChAnDWV9ExJjWsYCw/YjtP5bhhcAmksYDK4ApLZNOLm0RETGCOhYQkl4gSWV431LLA8AiYGdJO0raFJgNLOhUnRERY9XGTS1Y0nnAQcB4ScuBk4FNAGyfCbwd+KCk1cDjwGzbBlZLOh64HBgHzLd9a1N1RkREvcYCwvacAfpPB07vp28hsLCJuiIioj2dvoopIiLWUwmIiIiolYCIiIhaCYiIiKiVgIiIiFoJiIiIqJWAiIiIWgmIiIiolYCIiIhaCYiIiKiVgIiIiFoJiIiIqJWAiIiIWgmIiIiolYCIiIhaCYiIiKiVgIiIiFqNBYSk+ZJWSrqln/73SLpJ0s2SrpE0o6XvztJ+o6TupmqMiIj+tRUQknYfxLLPBmaupf/3wIG2dwc+Dczr03+w7T1tdw1i3RERMUTt7kF8VdJ1kv5a0jbtzGD7amDVWvqvsf1gGb0WmNxmLRERMQLaCgjbrwHeA0wBrpd0rqQ3DGMdxwKXta4S+KGk6yXNXduMkuZK6pbU3dPTM4wlRUSMbRu3O6HtOyT9A9ANfBnYS5KAT9n+/mALkHQwVUAc0NJ8gO0VknYArpD0m7JHUlfXPMrhqa6uLg+2joiIWFO75yD2kHQacBvwOuAttnctw6cNduWS9gC+Dsyy/UBvu+0V5edK4GJg38GuIyIiBqfdcxBfAW4AZtj+kO0bAGzfA/zDYFYsaSrwfeC9tn/b0r6lpK17h4FDgdoroSIiojntHmJ6E/C47acAJG0EbGb7MdvfqptB0nnAQcB4ScuBk4FNAGyfCZwEPI/qBDjA6nLF0vOBi0vbxsC5tn8wuM2LiIjBajcgrgQOAf5YxrcAfgi8ur8ZbM9Z2wJtHwccV9O+FJjx7DkiImIktXuIaTPbveFAGd6imZIiImJ90G5A/EnS3r0jkl4BPN5MSRERsT5o9xDTR4ELJd0DCHgB8K6mioqIiM5rKyBsL5L0UmCX0nS77b80V1ZERHRa21+UA/YBppd59paE7XMaqSoiIjqurYCQ9C3gxcCNwFOl2UACIiJiA9XuHkQXsJvt3MoiImKMaPcqpluoTkxHRMQY0e4exHhgsaTrgCd6G20f0UhVERHRce0GxClNFhEREeufdi9z/YmkacDOtq+UtAUwrtnSIiKik9q93fcHgO8BZ5WmScAlDdUUERHrgXZPUn8I2B94BKqHBwE7NFVURER0XrsB8YTtJ3tHJG1M9T2IiIjYQLUbED+R9Clg8/Is6guB/2yurIiI6LR2A+IEoAe4GfgrYCGDfJJcRESMDu1exfQ08LXyioiIMaDdq5h+L2lp31cb882XtFJS7TOlVfmypCWSburzzImjJd1RXke3v0kRETEc1uVeTL02A94BbN/GfGcDp9P/Tf3eCOxcXvsB/wbsJ2l7qmdYd1GdDL9e0gLbD7ZZb0REDFFbexC2H2h5rbD9JeBNbcx3NbBqLZPMAs5x5VpgW0kTgcOAK2yvKqFwBTCznVojImJ4tHuIae+WV5ek/826PUuiP5OAZS3jy0tbf+11tc2V1C2pu6enZxhKinZMnDwVSUhi4uSpo275se7ynow97X7If6FleDVwJ/DOYa9mEGzPA+YBdHV15bsZI+S+FcuY9slLAbjrc28edcuPdZf3ZOxp9yqmgxta/wpgSsv45NK2AjioT/tVDdUQERE12n2i3MfX1m/7i4Nc/wLgeEnnU52kftj2vZIuBz4jabsy3aHAiYNcR0REDMK6XMW0D9UHOsBbgOuAO9Y2k6TzqPYExktaTnVl0iYAts+k+sLd4cAS4DHg/aVvlaRPA4vKok61vbaT3RERMczaDYjJwN62HwWQdArwX7aPWttMtucM0G+qGwHW9c0H5rdZX0REDLN2b7XxfODJlvEnS1tERGyg2t2DOAe4TtLFZfxI4JuNVBQREeuFdq9i+mdJlwGvKU3vt/2r5sqKiIhOa/cQE8AWwCO2/xVYLmnHhmqKiIj1QLvfpD4Z+CTPXGq6CfDtpoqKiIjOa3cP4q3AEcCfAGzfA2zdVFEREdF57QbEk+WSVANI
2rK5kiIiYn3QbkB8V9JZVHdb/QBwJXl4UETEBm3Aq5gkCbgAeCnwCLALcJLtKxquLSIiOmjAgLBtSQtt7071XIaIiBgD2j3EdIOkfRqtJCIi1ivtfpN6P+AoSXdSXckkqp2LPZoqLCIiOmutASFpqu27qR4BGhERY8hAexCXUN3F9S5JF9l+2wjUFBER64GBzkGoZfhFTRYSERHrl4ECwv0MR0TEBm6gQ0wzJD1CtSexeRmGZ05SP7fR6iIiomPWGhC2xw1l4ZJmAv8KjAO+bvuzffpPAw4uo1sAO9jetvQ9Bdxc+u62fcRQaomIiHXT7mWu60zSOOAM4A3AcmCRpAW2F/dOY/tjLdN/GNirZRGP296zqfoiImLt1uV5EOtqX2CJ7aW2nwTOB2atZfo5wHkN1hMREeugyYCYBCxrGV9e2p5F0jRgR+BHLc2bSeqWdK2kI/tbiaS5Zbrunp6eYSg7IiKg2YBYF7OB79l+qqVtmu0u4N3AlyS9uG5G2/Nsd9numjBhwkjUGhExJjQZECuAKS3jk0tbndn0Obxke0X5uRS4ijXPT0RERMOaDIhFwM6SdpS0KVUILOg7kaSXAtsBv2hp207Sc8rweGB/YHHfeSMiojmNXcVke7Wk44HLqS5znW/7VkmnAt22e8NiNnB+eWJdr12BsyQ9TRVin229+ikiIprXWEAA2F4ILOzTdlKf8VNq5rsG2L3J2iIiYu3Wl5PUERGxnklARERErQRERETUSkBEREStBERERNRKQERERK0ERERE1EpARERErQRERETUSkBEREStBERERNRKQERERK0ERERE1EpARERErQRERETUSkBEREStBERERNRqNCAkzZR0u6Qlkk6o6T9GUo+kG8vruJa+oyXdUV5HN1lnREQ8W2OPHJU0DjgDeAOwHFgkaUHNs6UvsH18n3m3B04GugAD15d5H2yq3oiIWFOTexD7AktsL7X9JHA+MKvNeQ8DrrC9qoTCFcDMhuqMiIgaTQbEJGBZy/jy0tbX2yTdJOl7kqas47xImiupW1J3T0/PcNQdERF0/iT1fwLTbe9BtZfwzXVdgO15trtsd02YMGHYC4yIGKuaDIgVwJSW8cml7X/YfsD2E2X068Ar2p03IiKa1WRALAJ2lrSjpE2B2cCC1gkkTWwZPQK4rQxfDhwqaTtJ2wGHlraIiBghjV3FZHu1pOOpPtjHAfNt3yrpVKDb9gLgbyQdAawGVgHHlHlXSfo0VcgAnGp7VVO1RkTEszUWEAC2FwIL+7Sd1DJ8InBiP/POB+Y3WV9ERPSv0yepIyJiPZWAiIiIWgmIiIiolYCIiIhaCYiIiKiVgIiIiFoJiIiIqJWAiIiIWgmIiIiolYCIiIhaCYiIiKiVgIiIiFoJiIiIqJWAiIiIWgmIiIiolYCIiIhaCYiIiKjVaEBIminpdklLJJ1Q0/9xSYsl3STpvyVNa+l7StKN5bWg77wREdGsxh45KmkccAbwBmA5sEjSAtuLWyb7FdBl+zFJHwT+L/Cu0ve47T2bqi8iItauyT2IfYEltpfafhI4H5jVOoHtH9t+rIxeC0xusJ6IiFgHTQbEJGBZy/jy0tafY4HLWsY3k9Qt6VpJR/Y3k6S5Zbrunp6eIRUcERHPaOwQ07qQdBTQBRzY0jzN9gpJLwJ+JOlm27/rO6/tecA8gK6uLo9IwRERY0CTexArgCkt45NL2xokHQL8PXCE7Sd6222vKD+XAlcBezVYa0RE9NFkQCwCdpa0o6RNgdnAGlcjSdoLOIsqHFa2tG8n6TlleDywP9B6cjsiIhrW2CEm26slHQ9cDowD5tu+VdKpQLftBcDnga2ACyUB3G37CGBX4CxJT1OF2Gf7XP0UERENa/QchO2FwMI+bSe1DB/Sz3zXALs3WVtERKxdvkkdERG1EhAREVErAREREbUSEBERUSsBERERtRIQERFRKwERERG1EhAREVErAREREbUSEBERUSsBERERtRIQERFRKwERERG1EhAREVErAREREbUSEBERUSsBERERtRoNCEkzJd0uaYmkE2r6nyPpgtL/S0nTW/pOLO23SzqsyTojIuLZGgsISeOAM4A3ArsBcyTt1meyY4EHbe8EnAZ8rsy7GzAbeBkwE/hqWV5ERIyQJvcg9gWW2F5q+0ngfGBWn2lmAd8sw98DXi9Jpf1820/Y/j2wpCwvIiJGyMYNLnsSsKxlfDmwX3/T2F4t6WHgeaX92j7zTqpbiaS5wNwy+kdJtw+y3vHA/YOcd7Qa0jbf9bk3/89wlevDq6Hlj7X3eVi3t+n3fJiMmfe45f0YL2mw2zytv44mA2JE2J4HzBvqciR12+4ahpJGjWzzhm+sbS9km4dTk4eYVgBTWsYnl7baaSRtDGwDPNDmvBER0aAmA2IRsLOkHSVtSnXSeUGfaRYAR5fhtwM/su3SPrtc5bQjsDNwXYO1RkREH40dYirnFI4HLgfGAfNt3yrpVKDb9gLg34FvSVoCrKIKEcp03wUWA6uBD9l+qqlaiyEfphqFss0bvrG2vZBtHjaq/mCPiIhYU75JHRERtRIQERFRa8wFxFBu/zEatbG9H5e0WNJNkv5bUr/XRI8WA21zy3Rvk2RJo/6SyHa2WdI7y3t9q6RzR7rG4dbGv+2pkn4s6Vfl3/fhnahzuEiaL2mlpFv66ZekL5ffx02S9h7ySm2PmRfVyfLfAS8CNgV+DezWZ5q/Bs4sw7OBCzpdd8PbezCwRRn+4Gje3na3uUy3NXA11Rcyuzpd9wi8zzsDvwK2K+M7dLruEdjmecAHy/BuwJ2drnuI2/xaYG/gln76DwcuAwS8EvjlUNc51vYghnL7j9FowO21/WPbj5XRa6m+czKatfMeA3ya6t5ffx7J4hrSzjZ/ADjD9oMAtleOcI3DrZ1tNvDcMrwNcM8I1jfsbF9NdbVnf2YB57hyLbCtpIlDWedYC4i623/0vYXHGrf/AHpv/zEatbO9rY6l+gtkNBtwm8uu9xTb/zWShTWonff5JcBLJP1c0rWSZo5Ydc1oZ5tPAY6StBxYCHx4ZErrmHX9/z6gUX+rjRgeko4CuoADO11LkyRtBHwROKbDpYy0jakOMx1EtZd4taTdbT/UyaIaNgc42/YXJL2K6jtXL7f9dKcLGy3G2h7EUG7/MRq1dcsSSYcAfw8cYfuJEaqtKQNt89bAy4GrJN1Jdax2wSg/Ud3O+7wcWGD7L67ukPxbqsAYrdrZ5mOB7wLY/gWwGdWN/DZUw36LorEWEEO5/cdoNOD2StoLOIsqHEb7cWkYYJttP2x7vO3ptqdTnXc5wnZ3Z8odFu38u76Eau8BSeOpDjktHcEah1s723w38HoASbtSBUTPiFY5shYA7ytXM70SeNj2vUNZ4Jg6xOQh3P5jNGpzez8PbAVcWM7F3237iI4VPURtbvMGpc1tvhw4VNJi4Cngb22P1j3jdrf5E8DXJH2M6oT1MaP4jz0knUcV8uPLeZWTgU0AbJ9JdZ7lcKrn5zwGvH/I6xzFv6+IiGjQWDvEFBERbUpARERErQR
ERETUSkBEREStBERERNRKQET0IenIcpfXl3a6lohOSkBEPNsc4GflZyMkjWtq2RHDJQER0ULSVsABVLdpmF3axkn6f5JuKffZ/3Bp30fSNZJ+Lek6SVtLOkbS6S3Lu1TSQWX4j5K+IOnXwKsknSRpUVnuvN67BkvaSdKVZbk3SHqxpHMkHdmy3O9IqrtLbcSwSUBErGkW8APbvwUekPQKYC4wHdjT9h7Ad8rtHS4APmJ7BnAI8PgAy96S6h79M2z/DDjd9j62Xw5sDry5TPcdqltzzwBeDdxL9Q3/YwAkbVPaN5S70cZ6KgERsaY5VM8WoPycQ/Xhf1a5/Tu2VwG7APfaXlTaHuntX4ungItaxg9W9dTCm4HXAS+TtDUwyfbFZbl/tv2Y7Z9Q3XtoQqnpojbWFzEkY+peTBFrI2l7qg/q3SWZ6h4/proxXLtWs+YfXpu1DP/Z9lNlXZsBX6V6mt0ySaf0mbbOOcBRVIe+hnyfnYiBZA8i4hlvB75le1q52+sU4PdUj7P8q3L7994guR2YKGmf0rZ16b8T2FPSRpKmUD35rE5vGNxfznu8HcD2o8Dy3vMNqp6RvkWZ9mzgo2W6xcO21RH9SEBEPGMOcHGftouAiVS3jr6pnGB+d3nM5buAr5S2K6g+9H9OFSqLgS8DN9StqDyo52vALVR3JG3dS3kv8DeSbgKuAV5Q5vkDcBvwjaFuaEQ7cjfXiFGi7EncDOxt++FO1xMbvuxBRIwC5al/twFfSTjESMkeRERE1MoeRERE1EpARERErQRERETUSkBEREStBERERNT6/5WLAWlxQhHkAAAAAElFTkSuQmCC", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "import os\n", - "import json\n", - "import matplotlib.pyplot as plt\n", - "\n", - "def load_jsonl_files_recursively(dir_path):\n", - " all_data = []\n", - " \n", - " for root, _, files in os.walk(dir_path):\n", - " for file in files:\n", - " if file.endswith(\".jsonl\"):\n", - " file_path = os.path.join(root, file)\n", - " with open(file_path, \"r\") as f:\n", - " file_data = [json.loads(line) for line in f]\n", - " all_data.extend(file_data)\n", - " \n", - " return all_data\n", - "\n", - "def extract_accuracies(data):\n", - " accuracies = []\n", - " for record in data:\n", - " if 'final_report' in record:\n", - " accuracy = record['final_report']['accuracy']\n", - " accuracies.append(accuracy)\n", - " return accuracies\n", - "\n", - "# Load the data recursively\n", - "dir_path = \"evals\"\n", - "data = load_jsonl_files_recursively(dir_path)\n", - "\n", - "# Extract accuracies from the data\n", - "accuracies = extract_accuracies(data)\n", - "\n", - "# Plot the accuracies in a histogram chart\n", - "plt.hist(accuracies, bins=100, range=(0, 1), edgecolor='black')\n", - "plt.xlabel(\"Accuracy\")\n", - "plt.ylabel(\"Frequency\")\n", - "plt.title(\"Accuracy Histogram\")\n", - "plt.show()\n" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Counts for each eval_name:\n", - "test-match.s1.simple-v0: 22\n", - "None: 45\n", - "test-fuzzy-match.s1.simple-v0: 2\n" - ] - } - ], - "source": [ - "import os\n", - "import json\n", - "import numpy as np\n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "\n", - "def print_graph():\n", - " directory = 'evals/'\n", - " files = os.listdir(directory)\n", - " \n", - " eval_name_counter = {}\n", - " \n", - " for file in files:\n", - " if file.endswith(\".jsonl\"):\n", - " with open(os.path.join(directory, file), 'r') as f:\n", - " jsonl_content = f.read()\n", - " \n", - " # Read the JSONL content into a DataFrame\n", - " data = [json.loads(line) for line in jsonl_content.split('\\n') if line]\n", - " df = pd.DataFrame(data)\n", - "\n", - " if 'spec' not in df.columns:\n", - " continue\n", - "\n", - " # Extract the \"eval_name\" from the \"spec\" dictionaries\n", - " df['eval_name'] = df['spec'].apply(lambda x: x['eval_name'] if isinstance(x, dict) else None)\n", - "\n", - " for eval_name in df['eval_name']:\n", - " if eval_name not in eval_name_counter:\n", - " eval_name_counter[eval_name] = 0\n", - " eval_name_counter[eval_name] += 1\n", - "\n", - " # Print the counts\n", - " print(\"Counts for each eval_name:\")\n", - " for eval_name, count in eval_name_counter.items():\n", - " print(f\"{eval_name}: {count}\")\n", - "\n", - "print_graph()\n" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Counts for each eval_name:\n", - "test-match.s1.simple-v0: 22\n", - "test-fuzzy-match.s1.simple-v0: 2\n" - ] - } - ], - "source": [ - "import os\n", - "import json\n", - "import numpy as np\n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "\n", - "def print_graph():\n", - " directory = 'evals/'\n", - " files = os.listdir(directory)\n", - " \n", - " eval_name_counter = {}\n", - " \n", - " for file in files:\n", - " if file.endswith(\".jsonl\"):\n", - " with open(os.path.join(directory, file), 'r') 
as f:\n", - " jsonl_content = f.read()\n", - " \n", - " # Read the JSONL content into a DataFrame\n", - " data = [json.loads(line) for line in jsonl_content.split('\\n') if line]\n", - " df = pd.DataFrame(data)\n", - "\n", - " if 'spec' not in df.columns:\n", - " continue\n", - "\n", - " # Filter the DataFrame to only include rows with the \"spec\" key\n", - " spec_df = df[df['spec'].notna()].copy()\n", - "\n", - " # Extract the \"eval_name\" from the \"spec\" dictionaries\n", - " spec_df.loc[:, 'eval_name'] = spec_df['spec'].apply(lambda x: x['eval_name'])\n", - "\n", - " for eval_name in spec_df['eval_name']:\n", - " if eval_name not in eval_name_counter:\n", - " eval_name_counter[eval_name] = 0\n", - " eval_name_counter[eval_name] += 1\n", - "\n", - " # Print the counts\n", - " print(\"Counts for each eval_name:\")\n", - " for eval_name, count in eval_name_counter.items():\n", - " print(f\"{eval_name}: {count}\")\n", - "\n", - "print_graph()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.10.0 64-bit", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.0" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 000000000..12a0390ef --- /dev/null +++ b/poetry.lock @@ -0,0 +1,101 @@ +# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. + +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." 
+optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +files = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] + +[[package]] +name = "exceptiongroup" +version = "1.1.1" +description = "Backport of PEP 654 (exception groups)" +optional = false +python-versions = ">=3.7" +files = [ + {file = "exceptiongroup-1.1.1-py3-none-any.whl", hash = "sha256:232c37c63e4f682982c8b6459f33a8981039e5fb8756b2074364e5055c498c9e"}, + {file = "exceptiongroup-1.1.1.tar.gz", hash = "sha256:d484c3090ba2889ae2928419117447a14daf3c1231d5e30d0aae34f354f01785"}, +] + +[package.extras] +test = ["pytest (>=6)"] + +[[package]] +name = "iniconfig" +version = "2.0.0" +description = "brain-dead simple config-ini parsing" +optional = false +python-versions = ">=3.7" +files = [ + {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, + {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, +] + +[[package]] +name = "packaging" +version = "23.1" +description = "Core utilities for Python packages" +optional = false +python-versions = ">=3.7" +files = [ + {file = "packaging-23.1-py3-none-any.whl", hash = "sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61"}, + {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, +] + +[[package]] +name = "pluggy" +version = "1.0.0" +description = "plugin and hook calling mechanisms for python" +optional = false +python-versions = ">=3.6" +files = [ + {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, + {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, +] + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + +[[package]] +name = "pytest" +version = "7.3.2" +description = "pytest: simple powerful testing with Python" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pytest-7.3.2-py3-none-any.whl", hash = "sha256:cdcbd012c9312258922f8cd3f1b62a6580fdced17db6014896053d47cddf9295"}, + {file = "pytest-7.3.2.tar.gz", hash = "sha256:ee990a3cc55ba808b80795a79944756f315c67c12b56abd3ac993a7b8c17030b"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "sys_platform == \"win32\""} +exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=0.12,<2.0" +tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} + +[package.extras] +testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] + +[[package]] +name = "tomli" +version = "2.0.1" +description = "A lil' TOML parser" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, + {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, +] + +[metadata] +lock-version = "2.0" +python-versions = "^3.9" +content-hash = 
"c5b989915c413ab901c39dd0c4f3b0fe203558c2879952a2460a52bda4f3e857" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..2c099a5b8 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,23 @@ +[tool.poetry] +name = "agbenchmark" +version = "0.1.0" +description = "Benchmarking the performance of agents far and wide, regardless of how they are set up and how they work" +authors = ["Silen Naihin "] +license = "MIT" +readme = "README.md" + +[tool.poetry.dependencies] +python = "^3.9" +pytest = "^7.3.2" + + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + +[tool.pytest.ini_options] +minversion = "6.0" +addopts = "-ra -q" +testpaths = [ + "tests", "benchmark/challenges", +] diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index b1c5914ad..000000000 --- a/requirements.txt +++ /dev/null @@ -1,81 +0,0 @@ -aiodocker==0.21.0 -aiohttp==3.8.4 -aiosignal==1.3.1 -asn1crypto==1.5.1 -async-timeout==4.0.2 -attrs==23.1.0 -backoff==2.2.1 -blobfile==2.0.1 -cachetools==5.3.0 -certifi==2022.12.7 -cffi==1.15.1 -charset-normalizer==2.1.1 -click==8.1.3 -colorama==0.4.6 -contourpy==1.0.7 -cryptography==40.0.2 -cycler==0.11.0 -dataclasses-json==0.5.7 -docker==6.0.1 -evals==1.0.2.post1 -filelock==3.11.0 -fire==0.5.0 -fonttools==4.39.3 -frozenlist==1.3.3 -gptcache==0.1.13 -greenlet==2.0.2 -idna==3.4 -importlib-resources==5.12.0 -joblib==1.2.0 -kiwisolver==1.4.4 -langchain==0.0.142 -langdetect==1.0.9 -lxml==4.9.2 -lz4==4.3.2 -marshmallow==3.19.0 -marshmallow-enum==1.5.1 -matplotlib==3.7.1 -mock==5.0.2 -multidict==6.0.4 -mypy==1.2.0 -mypy-extensions==1.0.0 -nltk==3.8.1 -numexpr==2.8.4 -numpy==1.24.2 -openai==0.27.4 -openapi-schema-pydantic==1.2.4 -oscrypto==1.3.0 -packaging==23.1 -pandas==1.5.3 -Pillow==9.5.0 -portalocker==2.7.0 -pyarrow==10.0.1 -pycparser==2.21 -pycryptodomex==3.17 -pydantic==1.10.7 -PyJWT==2.6.0 -pyOpenSSL==23.1.1 -pyparsing==3.0.9 -python-dateutil==2.8.2 -pytz==2023.3 -PyYAML==6.0 -pyzstd==0.15.6 -regex==2023.3.23 -requests==2.28.2 -sacrebleu==2.3.1 -setuptools-scm==7.1.0 -six==1.16.0 -snowflake-connector-python==3.0.2 -SQLAlchemy==1.4.47 -tabulate==0.9.0 -tenacity==8.2.2 -termcolor==2.2.0 -tiktoken==0.3.3 -tomli==2.0.1 -tqdm==4.65.0 -typing-inspect==0.8.0 -typing_extensions==4.5.0 -urllib3==1.26.15 -websocket-client==1.5.1 -yarl==1.8.2 -zipp==3.15.0 diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_api.py b/tests/test_api.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_workspace_manager.py b/tests/test_workspace_manager.py new file mode 100644 index 000000000..e69de29bb -- cgit v1.2.3 From f37981c3884264c50f2af93799f3667b5dc42cca Mon Sep 17 00:00:00 2001 From: scarletpan Date: Mon, 19 Jun 2023 12:39:34 +0000 Subject: init first challenge template --- agbenchmark/benchmark/challenges/Challenge.py | 11 +++++++ .../benchmark/challenges/retrieval/r1_test.py | 29 +++++++++++++++++ data/README.md | 37 ++++++++++++++++++++++ data/retrieval/r1_test_data_0.json | 10 ++++++ data/retrieval/r1_test_data_1.json | 10 ++++++ examples/basic_gpt_agent.py | 26 +++++++++++++++ 6 files changed, 123 insertions(+) create mode 100644 data/README.md create mode 100644 data/retrieval/r1_test_data_0.json create mode 100644 data/retrieval/r1_test_data_1.json create mode 100644 examples/basic_gpt_agent.py diff --git 
a/agbenchmark/benchmark/challenges/Challenge.py b/agbenchmark/benchmark/challenges/Challenge.py index e69de29bb..bed522a85 100644 --- a/agbenchmark/benchmark/challenges/Challenge.py +++ b/agbenchmark/benchmark/challenges/Challenge.py @@ -0,0 +1,11 @@ +import json + +class Challenge(object): + def __init__(self, json_data): + self.json_data = json_data + + @classmethod + def from_json_file(cls, json_file): + with open(json_file) as f: + json_data = json.load(f) + return cls(json_data) \ No newline at end of file diff --git a/agbenchmark/benchmark/challenges/retrieval/r1_test.py b/agbenchmark/benchmark/challenges/retrieval/r1_test.py index e69de29bb..f300d094c 100644 --- a/agbenchmark/benchmark/challenges/retrieval/r1_test.py +++ b/agbenchmark/benchmark/challenges/retrieval/r1_test.py @@ -0,0 +1,29 @@ +from ..Challenge import Challenge + + +class RetrievelChallenge(Challenge): + """ Challenge for information-retrieval """ + def __init__(self, json_data): + self.json_data = json_data + assert self.json_data["category"] == "information-retrieval" + + @property + def agent_input(self): + return self.json_data["query"] + + def scoring(self, content): + for should_contain_word in self.json_data["ground"]["should_contain"]: + if should_contain_word not in content: + return 0. + + for should_not_contain_word in self.json_data["ground"]["should_not_contain"]: + if should_not_contain_word in content: + return 0. + return 1. + + def run(self, output_file): + output = open(output_file).read().strip() + + score = self.scoring(output) + + return score \ No newline at end of file diff --git a/data/README.md b/data/README.md new file mode 100644 index 000000000..d3e32b563 --- /dev/null +++ b/data/README.md @@ -0,0 +1,37 @@ +# Challenges Data Schema of Benchmark + +## General challenges +Input: +- **category** (str): information-retrieval +- **difficulty_level** (str): the difficulty of this query. choices from ["easy", "medium", "hard"] + + + +## Information-retrieval challenges +Input: +- **category** (str): information-retrieval +- **query** (str): the question that needs to be solved. +- **ground** (dict): The ground truth. + - **answer** (str): The raw text of the ground truth answer + - **should_contain** (list): the exact strings that are required in the final answer + - **should_not_contain** (list): the exact strings that should not be in the final answer +- **difficulty_level** (str): the difficulty of this query. choices from ["easy", "medium", "hard"]
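As a minimal sketch of how a grader consumes the `ground` block (this condenses the `scoring` method from `r1_test.py` above; the helper name `score_ground` is illustrative, not part of the repo):

```python
# Illustrative only: condensed form of the should_contain / should_not_contain checks.
def score_ground(content: str, ground: dict) -> float:
    # Fail if any required string is missing from the agent's answer.
    if any(word not in content for word in ground.get("should_contain", [])):
        return 0.0
    # Fail if any forbidden string appears in the agent's answer.
    if any(word in content for word in ground.get("should_not_contain", [])):
        return 0.0
    return 1.0


assert score_ground(
    "The capital of America is Washington, D.C.",
    {"should_contain": ["Washington"], "should_not_contain": ["New York"]},
) == 1.0
```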
choices from ["easy", "medium", "hard"] + +Example: +```python +{ + "category": "information-retrieval", + "query": "what is the capital of America", + "ground": { + "answer": "Washington", + "should_contain": ["Washington"], + "should_not_contain": ["New York", "Los Angeles", "San Francisco"] + }, + "difficulty_level": "easy" +} +``` + + + +Output: +- **score** (float): scores range from [0, 1] \ No newline at end of file diff --git a/data/retrieval/r1_test_data_0.json b/data/retrieval/r1_test_data_0.json new file mode 100644 index 000000000..a64f7e0cc --- /dev/null +++ b/data/retrieval/r1_test_data_0.json @@ -0,0 +1,10 @@ +{ + "category": "information-retrieval", + "query": "what is the capital of America", + "ground": { + "answer": "Washington", + "should_contain": ["Washington"], + "should_not_contain": ["New York", "Los Angeles", "San Francisco"] + }, + "difficulty_level": "easy" +} \ No newline at end of file diff --git a/data/retrieval/r1_test_data_1.json b/data/retrieval/r1_test_data_1.json new file mode 100644 index 000000000..73dec4cdd --- /dev/null +++ b/data/retrieval/r1_test_data_1.json @@ -0,0 +1,10 @@ +{ + "category": "information-retrieval", + "query": "The Nobel Prize in Literature 2012", + "ground": { + "answer": "Mo Yan", + "should_contain": ["Mo Yan"], + "should_not_contain": ["Murakami Haruki"] + }, + "difficulty_level": "easy" +} \ No newline at end of file diff --git a/examples/basic_gpt_agent.py b/examples/basic_gpt_agent.py new file mode 100644 index 000000000..e2cc380c8 --- /dev/null +++ b/examples/basic_gpt_agent.py @@ -0,0 +1,26 @@ +import json +import openai +from agbenchmark.benchmark.challenges.retrieval.r1_test import RetrievelChallenge + + +def basic_gpt_agent(challenge_file): + challenge = RetrievelChallenge.from_json_file(challenge_file) + + response = openai.ChatCompletion.create( + model="gpt-3.5-turbo-0613", + messages=[{"role": "user", "content": challenge.agent_input}]) + answer = response["choices"][0]["message"]["content"] + + output_file = "./basic_gpt_agent_retrieval_results.txt" + with open(output_file, "w") as f: + f.write(answer) + + print("QUERY : ", challenge.agent_input) + print("AGENT ANSWER: ", answer) + + score = challenge.run(output_file) + + print("AGENT SCORE : ", score) + +if __name__ == "__main__": + basic_gpt_agent("./data/retrieval/r1_test_data_1.json") -- cgit v1.2.3 From 1eb278f3cc36ad5087f3ec30ea8c4e6fc8efca3a Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Mon, 19 Jun 2023 09:53:30 -0400 Subject: Update README.md --- README.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 820c0f51e..02f792b70 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,14 @@ A repo built for the purpose of benchmarking the performance of agents far and wide, regardless of how they are set up and how they work -#### MVP: function calls api, api returns presigned url, folder is uploaded, write file challenge is measured, score is given +Simple boilerplate code that spins up a webserver to plug their agent into. We call multiple tasks by invoking different pytest commands on folders and once the agent stops or reaches 50 loops (which they will have to define). We handle the deletion of files after a run loop ends. Then we call call the POST request for the next task. 
Then we will spit out a combined benchmark once all tests run -#### Diagrams: https://whimsical.com/agbenchmark-5n4hXBq1ZGzBwRsK4TVY7x +- Agent adds tests by adding to our repo +- Agent abstracted from benchmark +- Scalable (parallel servers running tests) +- Better standardization + +##### Diagrams (out of date, cloud oriented): https://whimsical.com/agbenchmark-5n4hXBq1ZGzBwRsK4TVY7x ## Contributing -- cgit v1.2.3 From b7deb984f7749db4ba3c62dc0a34ddbda966af02 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Wed, 21 Jun 2023 11:43:18 -0400 Subject: start click, fixtures, types, challenge creation, mock run -stable (#37) --- README.md | 157 +++-- agbenchmark/Challenge.py | 32 + agbenchmark/benchmark/__init__.py | 0 agbenchmark/benchmark/benchmark.py | 1 - agbenchmark/benchmark/challenges/Challenge.py | 11 - agbenchmark/benchmark/challenges/__init__.py | 0 .../benchmark/challenges/adaptability/a1_test.py | 0 .../challenges/basic_abilities/browse_test.py | 0 .../challenges/basic_abilities/read_file_test.py | 0 .../basic_abilities/remember_context_test.py | 0 .../challenges/basic_abilities/write_file_test.py | 0 agbenchmark/benchmark/challenges/code/c1_test.py | 0 agbenchmark/benchmark/challenges/memory/m1_test.py | 0 .../benchmark/challenges/retrieval/r1_test.py | 29 - agbenchmark/benchmark/challenges/utils.py | 0 .../challenges/web_navigation/wn1_test.py | 0 .../benchmark/challenges/writing/w1_test.py | 0 agbenchmark/benchmark/run.py | 1 - agbenchmark/challenges/README.md | 42 ++ agbenchmark/challenges/__init__.py | 0 agbenchmark/challenges/adaptability/a1_test.py | 0 agbenchmark/challenges/code/c1_test.py | 0 agbenchmark/challenges/define_task_types.py | 29 + agbenchmark/challenges/memory/m1_test.py | 0 agbenchmark/challenges/retrieval/Retrieval.py | 27 + agbenchmark/challenges/retrieval/r1/r1_data.json | 11 + agbenchmark/challenges/retrieval/r1/r1_test.py | 25 + agbenchmark/challenges/web_navigation/wn1_test.py | 0 agbenchmark/challenges/writing/w1_test.py | 0 agbenchmark/config.json | 5 + agbenchmark/conftest.py | 43 ++ agbenchmark/metrics.py | 10 + agbenchmark/mocks/basic_gpt_agent.py | 20 + agbenchmark/mocks/tests/retrieval_manual.py | 10 + agbenchmark/server/__init__.py | 0 agbenchmark/server/api.py | 0 agbenchmark/server/utils.py | 0 agbenchmark/start_benchmark.py | 48 ++ agbenchmark/tests/basic_abilities/browse_test.py | 0 .../tests/basic_abilities/read_file_test.py | 0 .../tests/basic_abilities/remember_context_test.py | 0 .../tests/basic_abilities/write_file_test.py | 0 agbenchmark/utils.py | 1 + agbenchmark/workspace/__init__.py | 0 agbenchmark/workspace/cloud_services/aws.py | 0 agbenchmark/workspace/workspace_manager.py | 1 - data/README.md | 37 -- data/retrieval/r1_test_data_0.json | 10 - data/retrieval/r1_test_data_1.json | 10 - examples/basic_gpt_agent.py | 26 - file_to_check.txt | 1 + poetry.lock | 669 ++++++++++++++++++++- pyproject.toml | 13 +- tests/__init__.py | 0 tests/test_api.py | 0 tests/test_benchmark.py | 0 tests/test_workspace_manager.py | 0 57 files changed, 1052 insertions(+), 217 deletions(-) create mode 100644 agbenchmark/Challenge.py delete mode 100644 agbenchmark/benchmark/__init__.py delete mode 100644 agbenchmark/benchmark/benchmark.py delete mode 100644 agbenchmark/benchmark/challenges/Challenge.py delete mode 100644 agbenchmark/benchmark/challenges/__init__.py delete mode 100644 agbenchmark/benchmark/challenges/adaptability/a1_test.py delete mode 100644 agbenchmark/benchmark/challenges/basic_abilities/browse_test.py delete mode 100644 
agbenchmark/benchmark/challenges/basic_abilities/read_file_test.py delete mode 100644 agbenchmark/benchmark/challenges/basic_abilities/remember_context_test.py delete mode 100644 agbenchmark/benchmark/challenges/basic_abilities/write_file_test.py delete mode 100644 agbenchmark/benchmark/challenges/code/c1_test.py delete mode 100644 agbenchmark/benchmark/challenges/memory/m1_test.py delete mode 100644 agbenchmark/benchmark/challenges/retrieval/r1_test.py delete mode 100644 agbenchmark/benchmark/challenges/utils.py delete mode 100644 agbenchmark/benchmark/challenges/web_navigation/wn1_test.py delete mode 100644 agbenchmark/benchmark/challenges/writing/w1_test.py delete mode 100644 agbenchmark/benchmark/run.py create mode 100644 agbenchmark/challenges/README.md create mode 100644 agbenchmark/challenges/__init__.py create mode 100644 agbenchmark/challenges/adaptability/a1_test.py create mode 100644 agbenchmark/challenges/code/c1_test.py create mode 100644 agbenchmark/challenges/define_task_types.py create mode 100644 agbenchmark/challenges/memory/m1_test.py create mode 100644 agbenchmark/challenges/retrieval/Retrieval.py create mode 100644 agbenchmark/challenges/retrieval/r1/r1_data.json create mode 100644 agbenchmark/challenges/retrieval/r1/r1_test.py create mode 100644 agbenchmark/challenges/web_navigation/wn1_test.py create mode 100644 agbenchmark/challenges/writing/w1_test.py create mode 100644 agbenchmark/config.json create mode 100644 agbenchmark/conftest.py create mode 100644 agbenchmark/metrics.py create mode 100644 agbenchmark/mocks/basic_gpt_agent.py create mode 100644 agbenchmark/mocks/tests/retrieval_manual.py delete mode 100644 agbenchmark/server/__init__.py delete mode 100644 agbenchmark/server/api.py delete mode 100644 agbenchmark/server/utils.py create mode 100644 agbenchmark/start_benchmark.py create mode 100644 agbenchmark/tests/basic_abilities/browse_test.py create mode 100644 agbenchmark/tests/basic_abilities/read_file_test.py create mode 100644 agbenchmark/tests/basic_abilities/remember_context_test.py create mode 100644 agbenchmark/tests/basic_abilities/write_file_test.py create mode 100644 agbenchmark/utils.py delete mode 100644 agbenchmark/workspace/__init__.py delete mode 100644 agbenchmark/workspace/cloud_services/aws.py delete mode 100644 agbenchmark/workspace/workspace_manager.py delete mode 100644 data/README.md delete mode 100644 data/retrieval/r1_test_data_0.json delete mode 100644 data/retrieval/r1_test_data_1.json delete mode 100644 examples/basic_gpt_agent.py create mode 100644 file_to_check.txt delete mode 100644 tests/__init__.py delete mode 100644 tests/test_api.py delete mode 100644 tests/test_benchmark.py delete mode 100644 tests/test_workspace_manager.py diff --git a/README.md b/README.md index 02f792b70..216f1202c 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,8 @@ -# agbenchmark +# Auto-GPT Benchmark A repo built for the purpose of benchmarking the performance of agents far and wide, regardless of how they are set up and how they work -Simple boilerplate code that spins up a webserver to plug their agent into. We call multiple tasks by invoking different pytest commands on folders, running until the agent stops or reaches 50 loops (which they will have to define). We handle the deletion of files after a run loop ends. Then we call the POST request for the next task.
Then we will spit out a combined benchmark once all tests run - -- Agent adds tests by adding to our repo -- Agent abstracted from benchmark -- Scalable (parallel servers running tests) -- Better standardization - -##### Diagrams (out of date, cloud oriented): https://whimsical.com/agbenchmark-5n4hXBq1ZGzBwRsK4TVY7x +##### Diagrams: https://whimsical.com/agbenchmark-5n4hXBq1ZGzBwRsK4TVY7x ## Contributing @@ -19,7 +12,7 @@ - To add requirements `poetry add requirement`. - To run in venv `poetry run python script.py` -Feel free to merge with `main` at will (but also to ask for review) - if you can't send msg in R&D chat for access. +Feel free to create PRs to merge with `main` at will (but also feel free to ask for review) - if you can't send msg in R&D chat for access. If you push at any point and break things - it'll happen to everyone - fix it asap. Step 1 is to revert `main` to last working commit @@ -27,110 +20,96 @@ Let people know what beautiful code you write does, document everything well Share your progress :) -## Api +## How this works -FastAPI with REST, import requests +1. `pip install auto-gpt-benchmarks` +2. Add boilerplate code to start webserver to your agent (run loop and stop condition) +3. `agbenchmark start --challenge challenge_category` (remove the challenge flag to run all tests). Specify the hostname, port, and workspace directory in the config +4. We call the server to run the agent for each test +5. Show pass rate of tests, logs, and any other metrics -``` -POST hostname:8080/challenges -{ - "test_name": "" - "challenge": "memory" - optional -} -``` +### To run the basic existing mock (June 21) -## Auth: +1. Clone the repo `auto-gpt-benchmarks` +2. `pip install poetry` +3. `poetry shell` +4. `poetry install` +5. `agbenchmark start` + Keep config the same and watch the logs :) -get preSignedUrl from API +#### Bonuses + +- You can add tests by git cloning auto-gpt-benchmarks to your repo +- Agent is abstracted from benchmark, don't need to do any extra setup other than starting the server +- Simple, easy to use +- Don't have to deal with cloud or parallelization yet + +### Pytest + +To create a test: + ``` -POST preSignedUrl -{ - "artifacts": [{}] -} +@pytest.mark.parametrize( +"server_response", +["VARIABLE"], # VARIABLE = the query/goal you provide to the model +indirect=True, +) +@pytest.mark.(VARIABLE) # VARIABLE = category of the test +def test_file_in_workspace(workspace): # VARIABLE = the actual test that asserts +assert os.path.exists(os.path.join(workspace, "file_to_check.txt")) ``` -## Workspace +## Api + +FastAPI with REST, import requests to call in auto-gpt-benchmarks. Boilerplate code given to agent project to start server -Kubernetes with AWS3 or GCP +## Workspace -## Challenges +Defined by the user on config #### Dataset Manually created, existing challenges within Auto-Gpt, https://osu-nlp-group.github.io/Mind2Web/ -#### Simple challenge creation through a DSL (domain specific language) +## Repo ``` -Challenge TicTacToeCoding - Description "The agent should implement a basic tic-tac-toe game in Python." - Artifacts { - Code "tictactoe.py" - } - Tasks { - Code "Write a function to initialize the game board." - Code "Write a function to handle a player's turn." - Code "Write a function to check for a winning move." Test "Write tests for the blog post model, serializer, and view." Command "Run Django's test suite to ensure everything is working as expected."
- } - SuccessCriteria { - Correctness "The game should correctly alternate between two players." - Correctness "The game should correctly identify a winning move." - Efficiency "The game should not use unnecessary computational resources." - Design "The solution should follow good practices for Django and Django Rest Framework." - } -EndChallenge +|-- auto-gpt-benchmarks/ **main project directory** +| |-- metrics.py **combining scores, metrics, final evaluation** +| |-- start_benchmark.py **entry point from cli** +| |-- conftest.py **shared fixtures across all tests** +| |-- Challenge.py **easy challenge creation class?** +| |-- config.json **hostname, port, workspace folder** +| |-- challenges/ **challenges across different domains** +| | |-- adaptability/ +| | |-- basic_abilities/ +| | |-- code/ +| | |-- memory/ +| | |-- retrieval/ +| | |-- web_navigation/ +| | |-- writing/ +| |-- tests/ **challenges across different metrics** +| | |-- basic_abilities/ +| | |-- interface/ +| |-- workspace/ **workspace related func** +| | |-- **init**.py +| | |-- workspace_manager.py **creation, deletion** ``` -#### Validators - -Designed to handle specific types of output (e.g., text, code, structured data) +### Easy Challenge Creation -#### Logging - -Log different requests coming in - write file, change file, etc. Maybe a db in the future for metrics, logs, etc +tbd, but potentially shared Challenge class that challenges instantiate as challenges need different utils/metrics for eval #### Written Challenges For code, writing we can create a reference text and use metrics like METEOR, BERTScore, BARTScore -## Repo +#### Validators -``` -|-- agbenchmark/ **main project directory** -| |-- **init**.py -| |-- server/ -| | |-- **init**.py -| | |-- api.py **opens server on host and exposes urls** -| | |-- utils.py -| |-- benchmark/ -| | |-- **init**.py -| | |-- benchmark.py **combining scores, metrics, final evaluation** -| | |-- run.py **entry point. sets everything up** -| | |-- challenges/ **challenges across different metrics** -| | | |-- **init**.py -| | | |-- Challenge.py **easy challenge creation through Challenge class. potentially how DSL is defined. may need to inherit challenge class like Adaptability(Challenge)** -| | | |-- utils.py -| | | |-- adaptability.py -| | | |-- basic_abilities.py -| | | |-- code.py -| | | |-- memory.py -| | | |-- retrieval.py -| | | |-- web_navigation.py -| | | |-- writing.py -| |-- workspace/ **workspace related func** -| | |-- **init**.py -| | |-- workspace_manager.py **creation, deletion, preSignedUrl generation** -| | |-- cloud_services/ -| | | |-- **init**.py -| | | |-- aws.py **not finalized, but write, read, and del files** -|-- tests/ **test func of agbenchmark** -| |-- **init**.py -| |-- test_api.py -| |-- test_benchmark.py -| |-- test_workspace_manager.py -``` +Designed to handle specific types of output (e.g., text, code, structured data) + +#### Logging + +Log different requests coming in - write file, change file, etc. Maybe a db in the future for metrics, logs, etc Later: GitHub Actions integration, OpenAPI?, good versioning and backward compatibility diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py new file mode 100644 index 000000000..20bf55853 --- /dev/null +++ b/agbenchmark/Challenge.py @@ -0,0 +1,32 @@ +import os +from typing import Optional + + +class Challenge: + """The parent class to all specific challenges classes. 
+ Defines helper methods for running a challenge""" + + @staticmethod + def open_file(workspace: str, filename: str): + script_dir = os.path.abspath(workspace) + workspace_dir = os.path.join(script_dir, filename) + with open(workspace_dir, "r") as f: + return f.read() + + @staticmethod + def write_to_file(workspace: str, filename: str, content: str): + script_dir = os.path.abspath(workspace) + print("Writing file at", script_dir) + workspace_dir = os.path.join(script_dir, filename) + + # Open the file in write mode. + with open(workspace_dir, "w") as f: + # Write the content to the file. + f.write(content) + + def get_filenames_in_workspace(self, workspace: str): + return [ + filename + for filename in os.listdir(workspace) + if os.path.isfile(os.path.join(workspace, filename)) + ] diff --git a/agbenchmark/benchmark/__init__.py b/agbenchmark/benchmark/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/benchmark/benchmark.py b/agbenchmark/benchmark/benchmark.py deleted file mode 100644 index 2f8124272..000000000 --- a/agbenchmark/benchmark/benchmark.py +++ /dev/null @@ -1 +0,0 @@ -# how well the agent did on the challenges, the metrics calculation diff --git a/agbenchmark/benchmark/challenges/Challenge.py b/agbenchmark/benchmark/challenges/Challenge.py deleted file mode 100644 index bed522a85..000000000 --- a/agbenchmark/benchmark/challenges/Challenge.py +++ /dev/null @@ -1,11 +0,0 @@ -import json - -class Challenge(object): - def __init__(self, json_data): - self.json_data = json_data - - @classmethod - def from_json_file(cls, json_file): - with open(json_file) as f: - json_data = json.load(f) - return cls(json_data) \ No newline at end of file diff --git a/agbenchmark/benchmark/challenges/__init__.py b/agbenchmark/benchmark/challenges/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/benchmark/challenges/adaptability/a1_test.py b/agbenchmark/benchmark/challenges/adaptability/a1_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/benchmark/challenges/basic_abilities/browse_test.py b/agbenchmark/benchmark/challenges/basic_abilities/browse_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/benchmark/challenges/basic_abilities/read_file_test.py b/agbenchmark/benchmark/challenges/basic_abilities/read_file_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/benchmark/challenges/basic_abilities/remember_context_test.py b/agbenchmark/benchmark/challenges/basic_abilities/remember_context_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/benchmark/challenges/basic_abilities/write_file_test.py b/agbenchmark/benchmark/challenges/basic_abilities/write_file_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/benchmark/challenges/code/c1_test.py b/agbenchmark/benchmark/challenges/code/c1_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/benchmark/challenges/memory/m1_test.py b/agbenchmark/benchmark/challenges/memory/m1_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/benchmark/challenges/retrieval/r1_test.py b/agbenchmark/benchmark/challenges/retrieval/r1_test.py deleted file mode 100644 index f300d094c..000000000 --- a/agbenchmark/benchmark/challenges/retrieval/r1_test.py +++ /dev/null @@ -1,29 +0,0 @@ -from ..Challenge import Challenge - - -class RetrievelChallenge(Challenge): - """ 
Challenge for information-retrieval """ - def __init__(self, json_data): - self.json_data = json_data - assert self.json_data["category"] == "information-retrieval" - - @property - def agent_input(self): - return self.json_data["query"] - - def scoring(self, content): - for should_contain_word in self.json_data["ground"]["should_contain"]: - if should_contain_word not in content: - return 0. - - for should_not_contain_word in self.json_data["ground"]["should_not_contain"]: - if should_not_contain_word in content: - return 0. - return 1. - - def run(self, output_file): - output = open(output_file).read().strip() - - score = self.scoring(output) - - return score \ No newline at end of file diff --git a/agbenchmark/benchmark/challenges/utils.py b/agbenchmark/benchmark/challenges/utils.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/benchmark/challenges/web_navigation/wn1_test.py b/agbenchmark/benchmark/challenges/web_navigation/wn1_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/benchmark/challenges/writing/w1_test.py b/agbenchmark/benchmark/challenges/writing/w1_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/benchmark/run.py b/agbenchmark/benchmark/run.py deleted file mode 100644 index b07ac6b55..000000000 --- a/agbenchmark/benchmark/run.py +++ /dev/null @@ -1 +0,0 @@ -# running all of the different challenges diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md new file mode 100644 index 000000000..40281c99b --- /dev/null +++ b/agbenchmark/challenges/README.md @@ -0,0 +1,42 @@ +# Challenges Data Schema of Benchmark + +## General challenges + +Input: + +- **category** (str): information-retrieval +- **difficulty** (str): the difficulty of this query. choices from ["easy", "medium", "hard"] + +## Information-retrieval challenges + +Input: + +- **category** (str): information-retrieval +- **task** (str): the question the agent needs to solve. +- **ground** (dict): The ground truth. + - **answer** (str): The raw text of the ground truth answer + - **should_contain** (list): the exact strings that are required in the final answer + - **should_not_contain** (list): the exact strings that should not be in the final answer + - **files**: files that are used for retrieval +- **difficulty** (str): the difficulty of this query. choices from ["easy", "medium", "hard"]
+ +Example: + +```python +{ + "category": "retrieval", + "task": "What is the capital of America?", + "ground": { + "answer": "Washington", + "should_contain": ["Washington"], + "should_not_contain": ["New York", "Los Angeles", "San Francisco"], + "files": ["file_to_check.txt"] + }, + "difficulty": "easy" +} + +``` + +Output: + +- **score** (float): scores range from [0, 1] diff --git a/agbenchmark/challenges/__init__.py b/agbenchmark/challenges/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/challenges/adaptability/a1_test.py b/agbenchmark/challenges/adaptability/a1_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/challenges/code/c1_test.py b/agbenchmark/challenges/code/c1_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py new file mode 100644 index 000000000..94b5ba533 --- /dev/null +++ b/agbenchmark/challenges/define_task_types.py @@ -0,0 +1,29 @@ +from pydantic import BaseModel +from typing import List, Optional +import json +import os + + +class Ground(BaseModel): + answer: str + should_contain: Optional[List[str]] + should_not_contain: Optional[List[str]] + files: List[str] + + +class Challenge(BaseModel): + category: str + task: str + ground: Ground + difficulty: str + + def serialize(self, path: str) -> None: + with open(path, "w") as file: + file.write(self.json()) + + @staticmethod + def deserialize(path: str) -> "Challenge": + print("Deserializing", path) + with open(path, "r") as file: + data = json.load(file) + return Challenge(**data) diff --git a/agbenchmark/challenges/memory/m1_test.py b/agbenchmark/challenges/memory/m1_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/challenges/retrieval/Retrieval.py b/agbenchmark/challenges/retrieval/Retrieval.py new file mode 100644 index 000000000..2db22ae4d --- /dev/null +++ b/agbenchmark/challenges/retrieval/Retrieval.py @@ -0,0 +1,27 @@ +from agbenchmark.Challenge import Challenge +from agbenchmark.challenges.define_task_types import Ground + + +class RetrievalChallenge(Challenge): + """Challenge for information-retrieval""" + + def scoring(self, content: str, ground: Ground): + if ground.should_contain: + for should_contain_word in ground.should_contain: + if should_contain_word not in content: + return 0.0 + else: + print( + f"Word that should exist: {should_contain_word} exists in the content" + ) + + if ground.should_not_contain: + for should_not_contain_word in ground.should_not_contain: + if should_not_contain_word in content: + return 0.0 + else: + print( + f"Word that should not exist: {should_not_contain_word} does not exist in the content" + ) + + return 1.0 diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json new file mode 100644 index 000000000..b5d5701ea --- /dev/null +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -0,0 +1,11 @@ +{ + "category": "retrieval", + "task": "What is the capital of America?", + "ground": { + "answer": "Washington", + "should_contain": ["Washington"], + "should_not_contain": ["New York", "Los Angeles", "San Francisco"], + "files": ["file_to_check.txt"] + }, + "difficulty": "easy" +}
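For orientation, a rough sketch of how a challenge definition like `r1_data.json` above flows through the pydantic models in `define_task_types.py` (assumes the repo root as the working directory):

```python
# Load the challenge JSON into the typed Challenge/Ground models shown above.
from agbenchmark.challenges.define_task_types import Challenge

data = Challenge.deserialize("agbenchmark/challenges/retrieval/r1/r1_data.json")
print(data.task)                   # "What is the capital of America?"
print(data.ground.should_contain)  # ["Washington"]
```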
diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py new file mode 100644 index 000000000..195de15f8 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -0,0 +1,25 @@ +import pytest +from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge +from agbenchmark.challenges.define_task_types import Challenge, Ground +import os + +data = Challenge.deserialize(os.path.join(os.path.dirname(__file__), "r1_data.json")) + + +class TestRetrieval1(RetrievalChallenge): + """The first information-retrieval challenge""" + + @pytest.mark.parametrize( + "server_response", + [data.task], + indirect=True, + ) + @pytest.mark.retrieval + def test_retrieval(self, workspace): + file = self.open_file(workspace, data.ground.files[0]) + + score = self.scoring(file, data.ground) + + print("Your score is:", score) + + assert score diff --git a/agbenchmark/challenges/web_navigation/wn1_test.py b/agbenchmark/challenges/web_navigation/wn1_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/challenges/writing/w1_test.py b/agbenchmark/challenges/writing/w1_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/config.json b/agbenchmark/config.json new file mode 100644 index 000000000..d285627e5 --- /dev/null +++ b/agbenchmark/config.json @@ -0,0 +1,5 @@ +{ + "hostname": "localhost", + "port": 8080, + "workspace": "agbenchmark/mocks/workspace" +} diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py new file mode 100644 index 000000000..b3ca086d8 --- /dev/null +++ b/agbenchmark/conftest.py @@ -0,0 +1,43 @@ +import json +import os +import pytest +import shutil +from agbenchmark.mocks.tests.retrieval_manual import mock_retrieval +import requests + + +@pytest.fixture(scope="module") +def config(): + config_file = os.path.abspath("agbenchmark/config.json") + print(f"Config file: {config_file}") + with open(config_file, "r") as f: + config = json.load(f) + return config + + +@pytest.fixture +def workspace(config): + yield config["workspace"] + # teardown after test function completes + for filename in os.listdir(config["workspace"]): + file_path = os.path.join(config["workspace"], filename) + try: + if os.path.isfile(file_path) or os.path.islink(file_path): + os.unlink(file_path) + elif os.path.isdir(file_path): + shutil.rmtree(file_path) + except Exception as e: + print(f"Failed to delete {file_path}. Reason: {e}") + + +@pytest.fixture(autouse=True) +def server_response(request, config): + task = request.param # The task is passed in indirectly + print(f"Server starting at {request.module}") + # response = requests.post( + # f"{config['hostname']}:{config['port']}", data={"task": task} + # ) + # assert ( + # response.status_code == 200 + # ), f"Request failed with status code {response.status_code}" + mock_retrieval(task, config["workspace"])
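Taken together, these fixtures mean a new challenge test only has to parametrize `server_response` with its task string and then inspect the workspace. A minimal sketch of such a test (hypothetical, mirroring the pattern in `r1_test.py` above; the mark name `basic` and the task string are illustrative, not part of the commit):

```python
import os

import pytest


class TestWriteFile:
    @pytest.mark.parametrize(
        "server_response",
        ["Write 'hello' into file_to_check.txt"],  # task handed to the agent
        indirect=True,
    )
    @pytest.mark.basic
    def test_write_file(self, workspace):
        # The autouse server_response fixture has already run the mock agent,
        # so the test only checks what landed in the shared workspace.
        assert os.path.exists(os.path.join(workspace, "file_to_check.txt"))
```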
Reason: {e}") + + +@pytest.fixture(autouse=True) +def server_response(request, config): + task = request.param # The task is passed in indirectly + print(f"Server starting at {request.module}") + # response = requests.post( + # f"{config['hostname']}:{config['port']}", data={"task": task} + # ) + # assert ( + # response.status_code == 200 + # ), f"Request failed with status code {response.status_code}" + mock_retrieval(task, config["workspace"]) diff --git a/agbenchmark/metrics.py b/agbenchmark/metrics.py new file mode 100644 index 000000000..bf72570a7 --- /dev/null +++ b/agbenchmark/metrics.py @@ -0,0 +1,10 @@ +# how well the agent did on the challenges, the metrics calculation for the future if we're tracking specific tests + +# POTENTIAL METRICS +# pass/fail - in the future could have a % metric of challenge completed, milestones achieved +# convergence - how long it took to get the result +# difficulty of the task - defined by previous comparing to runs against other agents +# consistency +# time passed +# budget used +# divergence (distractions not related to task at hand) diff --git a/agbenchmark/mocks/basic_gpt_agent.py b/agbenchmark/mocks/basic_gpt_agent.py new file mode 100644 index 000000000..6aac3d191 --- /dev/null +++ b/agbenchmark/mocks/basic_gpt_agent.py @@ -0,0 +1,20 @@ +import json +import openai + + +def basic_gpt_agent(query) -> str: + response = openai.ChatCompletion.create( + model="gpt-3.5-turbo-0613", messages=[{"role": "user", "content": query}] + ) + + answer = response["choices"][0]["message"]["content"] # type: ignore + + print("QUERY : ", query) + print("AGENT ANSWER: ", answer) + + return answer + + +if __name__ == "__main__": + # server boilerplate example here + basic_gpt_agent("") diff --git a/agbenchmark/mocks/tests/retrieval_manual.py b/agbenchmark/mocks/tests/retrieval_manual.py new file mode 100644 index 000000000..ccb482132 --- /dev/null +++ b/agbenchmark/mocks/tests/retrieval_manual.py @@ -0,0 +1,10 @@ +from ..basic_gpt_agent import basic_gpt_agent +from agbenchmark.Challenge import Challenge + + +def mock_retrieval(task: str, workspace: str): + # Call the basic_gpt_agent to get a response. + response = basic_gpt_agent(task) + + # Open the file in write mode. + Challenge.write_to_file(workspace, "file_to_check.txt", response) diff --git a/agbenchmark/server/__init__.py b/agbenchmark/server/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/server/api.py b/agbenchmark/server/api.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/server/utils.py b/agbenchmark/server/utils.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py new file mode 100644 index 000000000..79f308435 --- /dev/null +++ b/agbenchmark/start_benchmark.py @@ -0,0 +1,48 @@ +import click +import pytest +import json +import os + + +@click.group() +def cli(): + pass + + +@cli.command() +@click.option("--challenge", default=None, help="Specific challenge to run") +def start(challenge): + """Start the benchmark tests. 
diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py new file mode 100644 index 000000000..79f308435 --- /dev/null +++ b/agbenchmark/start_benchmark.py @@ -0,0 +1,48 @@ +import click +import pytest +import json +import os + + +@click.group() +def cli(): + pass + + +@cli.command() +@click.option("--challenge", default=None, help="Specific challenge to run") +def start(challenge): + """Start the benchmark tests. If a challenge flag is provided, run the challenges with that mark.""" + with open("agbenchmark/config.json", "r") as f: + config = json.load(f) + + print("Current configuration:") + for key, value in config.items(): + print(f"{key}: {value}") + + update_config = click.confirm( + "\nDo you want to update these parameters?", default=False + ) + if update_config: + config["hostname"] = click.prompt( + "\nPlease enter a new hostname", default=config["hostname"] + ) + config["port"] = click.prompt("Please enter a new port", default=config["port"]) + config["workspace"] = click.prompt( + "Please enter a new workspace path", default=config["workspace"] + ) + + with open("agbenchmark/config.json", "w") as f: + json.dump(config, f) + + print("Starting benchmark tests...", challenge) + if challenge: + print(f"Running {challenge} challenges") + pytest.main(["agbenchmark", "-m", challenge, "-vs"]) + else: + print("Running all challenges") + pytest.main(["agbenchmark", "-vs"]) + + +if __name__ == "__main__": + start() diff --git a/agbenchmark/tests/basic_abilities/browse_test.py b/agbenchmark/tests/basic_abilities/browse_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/tests/basic_abilities/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/tests/basic_abilities/remember_context_test.py b/agbenchmark/tests/basic_abilities/remember_context_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/tests/basic_abilities/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py new file mode 100644 index 000000000..b05a7ac31 --- /dev/null +++ b/agbenchmark/utils.py @@ -0,0 +1 @@ +# radio charts, logs, helper functions for tests, anything else relevant. diff --git a/agbenchmark/workspace/__init__.py b/agbenchmark/workspace/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/workspace/cloud_services/aws.py b/agbenchmark/workspace/cloud_services/aws.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/workspace/workspace_manager.py b/agbenchmark/workspace/workspace_manager.py deleted file mode 100644 index cfcf3f7ac..000000000 --- a/agbenchmark/workspace/workspace_manager.py +++ /dev/null @@ -1 +0,0 @@ -# Manages the workspaces including creation, deletion, etc diff --git a/data/README.md b/data/README.md deleted file mode 100644 index d3e32b563..000000000 --- a/data/README.md +++ /dev/null @@ -1,37 +0,0 @@ -# Challenges Data Schema of Benchmark - -## General challenges -Input: -- **category** (str): information-retrieval -- **difficulty_level** (str): the difficulty of this query. choices from ["easy", "medium", "hard"] - - - -## Information-retrieval challenges -Input: -- **category** (str): information-retrieval -- **query** (str): the question that needs to be solved. -- **ground** (dict): The ground truth. - - **answer** (str): The raw text of the ground truth answer - - **should_contain** (list): the exact strings that are required in the final answer - - **should_not_contain** (list): the exact strings that should not be in the final answer -- **difficulty_level** (str): the difficulty of this query. 
choices from ["easy", "medium", "hard"] - -Example: -```python -{ - "category": "information-retrieval", - "query": "what is the capital of America", - "ground": { - "answer": "Washington", - "should_contain": ["Washington"], - "should_not_contain": ["New York", "Los Angeles", "San Francisco"] - }, - "difficulty_level": "easy" -} -``` - - - -Output: -- **score** (float): scores range from [0, 1] \ No newline at end of file diff --git a/data/retrieval/r1_test_data_0.json b/data/retrieval/r1_test_data_0.json deleted file mode 100644 index a64f7e0cc..000000000 --- a/data/retrieval/r1_test_data_0.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "category": "information-retrieval", - "query": "what is the capital of America", - "ground": { - "answer": "Washington", - "should_contain": ["Washington"], - "should_not_contain": ["New York", "Los Angeles", "San Francisco"] - }, - "difficulty_level": "easy" -} \ No newline at end of file diff --git a/data/retrieval/r1_test_data_1.json b/data/retrieval/r1_test_data_1.json deleted file mode 100644 index 73dec4cdd..000000000 --- a/data/retrieval/r1_test_data_1.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "category": "information-retrieval", - "query": "The Nobel Prize in Literature 2012", - "ground": { - "answer": "Mo Yan", - "should_contain": ["Mo Yan"], - "should_not_contain": ["Murakami Haruki"] - }, - "difficulty_level": "easy" -} \ No newline at end of file diff --git a/examples/basic_gpt_agent.py b/examples/basic_gpt_agent.py deleted file mode 100644 index e2cc380c8..000000000 --- a/examples/basic_gpt_agent.py +++ /dev/null @@ -1,26 +0,0 @@ -import json -import openai -from agbenchmark.benchmark.challenges.retrieval.r1_test import RetrievelChallenge - - -def basic_gpt_agent(challenge_file): - challenge = RetrievelChallenge.from_json_file(challenge_file) - - response = openai.ChatCompletion.create( - model="gpt-3.5-turbo-0613", - messages=[{"role": "user", "content": challenge.agent_input}]) - answer = response["choices"][0]["message"]["content"] - - output_file = "./basic_gpt_agent_retrieval_results.txt" - with open(output_file, "w") as f: - f.write(answer) - - print("QUERY : ", challenge.agent_input) - print("AGENT ANSWER: ", answer) - - score = challenge.run(output_file) - - print("AGENT SCORE : ", score) - -if __name__ == "__main__": - basic_gpt_agent("./data/retrieval/r1_test_data_1.json") diff --git a/file_to_check.txt b/file_to_check.txt new file mode 100644 index 000000000..29afa8611 --- /dev/null +++ b/file_to_check.txt @@ -0,0 +1 @@ +The capital of America is Washington, D.C. \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index 12a0390ef..3f1059aaf 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,5 +1,265 @@ # This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. 
+[[package]] +name = "aiohttp" +version = "3.8.4" +description = "Async http client/server framework (asyncio)" +optional = false +python-versions = ">=3.6" +files = [ + {file = "aiohttp-3.8.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5ce45967538fb747370308d3145aa68a074bdecb4f3a300869590f725ced69c1"}, + {file = "aiohttp-3.8.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b744c33b6f14ca26b7544e8d8aadff6b765a80ad6164fb1a430bbadd593dfb1a"}, + {file = "aiohttp-3.8.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1a45865451439eb320784918617ba54b7a377e3501fb70402ab84d38c2cd891b"}, + {file = "aiohttp-3.8.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a86d42d7cba1cec432d47ab13b6637bee393a10f664c425ea7b305d1301ca1a3"}, + {file = "aiohttp-3.8.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ee3c36df21b5714d49fc4580247947aa64bcbe2939d1b77b4c8dcb8f6c9faecc"}, + {file = "aiohttp-3.8.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:176a64b24c0935869d5bbc4c96e82f89f643bcdf08ec947701b9dbb3c956b7dd"}, + {file = "aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c844fd628851c0bc309f3c801b3a3d58ce430b2ce5b359cd918a5a76d0b20cb5"}, + {file = "aiohttp-3.8.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5393fb786a9e23e4799fec788e7e735de18052f83682ce2dfcabaf1c00c2c08e"}, + {file = "aiohttp-3.8.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e4b09863aae0dc965c3ef36500d891a3ff495a2ea9ae9171e4519963c12ceefd"}, + {file = "aiohttp-3.8.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:adfbc22e87365a6e564c804c58fc44ff7727deea782d175c33602737b7feadb6"}, + {file = "aiohttp-3.8.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:147ae376f14b55f4f3c2b118b95be50a369b89b38a971e80a17c3fd623f280c9"}, + {file = "aiohttp-3.8.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:eafb3e874816ebe2a92f5e155f17260034c8c341dad1df25672fb710627c6949"}, + {file = "aiohttp-3.8.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c6cc15d58053c76eacac5fa9152d7d84b8d67b3fde92709195cb984cfb3475ea"}, + {file = "aiohttp-3.8.4-cp310-cp310-win32.whl", hash = "sha256:59f029a5f6e2d679296db7bee982bb3d20c088e52a2977e3175faf31d6fb75d1"}, + {file = "aiohttp-3.8.4-cp310-cp310-win_amd64.whl", hash = "sha256:fe7ba4a51f33ab275515f66b0a236bcde4fb5561498fe8f898d4e549b2e4509f"}, + {file = "aiohttp-3.8.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:3d8ef1a630519a26d6760bc695842579cb09e373c5f227a21b67dc3eb16cfea4"}, + {file = "aiohttp-3.8.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5b3f2e06a512e94722886c0827bee9807c86a9f698fac6b3aee841fab49bbfb4"}, + {file = "aiohttp-3.8.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3a80464982d41b1fbfe3154e440ba4904b71c1a53e9cd584098cd41efdb188ef"}, + {file = "aiohttp-3.8.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b631e26df63e52f7cce0cce6507b7a7f1bc9b0c501fcde69742130b32e8782f"}, + {file = "aiohttp-3.8.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f43255086fe25e36fd5ed8f2ee47477408a73ef00e804cb2b5cba4bf2ac7f5e"}, + {file = "aiohttp-3.8.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4d347a172f866cd1d93126d9b239fcbe682acb39b48ee0873c73c933dd23bd0f"}, + {file = "aiohttp-3.8.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:a3fec6a4cb5551721cdd70473eb009d90935b4063acc5f40905d40ecfea23e05"}, + {file = "aiohttp-3.8.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:80a37fe8f7c1e6ce8f2d9c411676e4bc633a8462844e38f46156d07a7d401654"}, + {file = "aiohttp-3.8.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d1e6a862b76f34395a985b3cd39a0d949ca80a70b6ebdea37d3ab39ceea6698a"}, + {file = "aiohttp-3.8.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:cd468460eefef601ece4428d3cf4562459157c0f6523db89365202c31b6daebb"}, + {file = "aiohttp-3.8.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:618c901dd3aad4ace71dfa0f5e82e88b46ef57e3239fc7027773cb6d4ed53531"}, + {file = "aiohttp-3.8.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:652b1bff4f15f6287550b4670546a2947f2a4575b6c6dff7760eafb22eacbf0b"}, + {file = "aiohttp-3.8.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:80575ba9377c5171407a06d0196b2310b679dc752d02a1fcaa2bc20b235dbf24"}, + {file = "aiohttp-3.8.4-cp311-cp311-win32.whl", hash = "sha256:bbcf1a76cf6f6dacf2c7f4d2ebd411438c275faa1dc0c68e46eb84eebd05dd7d"}, + {file = "aiohttp-3.8.4-cp311-cp311-win_amd64.whl", hash = "sha256:6e74dd54f7239fcffe07913ff8b964e28b712f09846e20de78676ce2a3dc0bfc"}, + {file = "aiohttp-3.8.4-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:880e15bb6dad90549b43f796b391cfffd7af373f4646784795e20d92606b7a51"}, + {file = "aiohttp-3.8.4-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb96fa6b56bb536c42d6a4a87dfca570ff8e52de2d63cabebfd6fb67049c34b6"}, + {file = "aiohttp-3.8.4-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4a6cadebe132e90cefa77e45f2d2f1a4b2ce5c6b1bfc1656c1ddafcfe4ba8131"}, + {file = "aiohttp-3.8.4-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f352b62b45dff37b55ddd7b9c0c8672c4dd2eb9c0f9c11d395075a84e2c40f75"}, + {file = "aiohttp-3.8.4-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ab43061a0c81198d88f39aaf90dae9a7744620978f7ef3e3708339b8ed2ef01"}, + {file = "aiohttp-3.8.4-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c9cb1565a7ad52e096a6988e2ee0397f72fe056dadf75d17fa6b5aebaea05622"}, + {file = "aiohttp-3.8.4-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:1b3ea7edd2d24538959c1c1abf97c744d879d4e541d38305f9bd7d9b10c9ec41"}, + {file = "aiohttp-3.8.4-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:7c7837fe8037e96b6dd5cfcf47263c1620a9d332a87ec06a6ca4564e56bd0f36"}, + {file = "aiohttp-3.8.4-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:3b90467ebc3d9fa5b0f9b6489dfb2c304a1db7b9946fa92aa76a831b9d587e99"}, + {file = "aiohttp-3.8.4-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:cab9401de3ea52b4b4c6971db5fb5c999bd4260898af972bf23de1c6b5dd9d71"}, + {file = "aiohttp-3.8.4-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:d1f9282c5f2b5e241034a009779e7b2a1aa045f667ff521e7948ea9b56e0c5ff"}, + {file = "aiohttp-3.8.4-cp36-cp36m-win32.whl", hash = "sha256:5e14f25765a578a0a634d5f0cd1e2c3f53964553a00347998dfdf96b8137f777"}, + {file = "aiohttp-3.8.4-cp36-cp36m-win_amd64.whl", hash = "sha256:4c745b109057e7e5f1848c689ee4fb3a016c8d4d92da52b312f8a509f83aa05e"}, + {file = "aiohttp-3.8.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:aede4df4eeb926c8fa70de46c340a1bc2c6079e1c40ccf7b0eae1313ffd33519"}, + {file = "aiohttp-3.8.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:4ddaae3f3d32fc2cb4c53fab020b69a05c8ab1f02e0e59665c6f7a0d3a5be54f"}, + {file = "aiohttp-3.8.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4eb3b82ca349cf6fadcdc7abcc8b3a50ab74a62e9113ab7a8ebc268aad35bb9"}, + {file = "aiohttp-3.8.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9bcb89336efa095ea21b30f9e686763f2be4478f1b0a616969551982c4ee4c3b"}, + {file = "aiohttp-3.8.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c08e8ed6fa3d477e501ec9db169bfac8140e830aa372d77e4a43084d8dd91ab"}, + {file = "aiohttp-3.8.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c6cd05ea06daca6ad6a4ca3ba7fe7dc5b5de063ff4daec6170ec0f9979f6c332"}, + {file = "aiohttp-3.8.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:b7a00a9ed8d6e725b55ef98b1b35c88013245f35f68b1b12c5cd4100dddac333"}, + {file = "aiohttp-3.8.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:de04b491d0e5007ee1b63a309956eaed959a49f5bb4e84b26c8f5d49de140fa9"}, + {file = "aiohttp-3.8.4-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:40653609b3bf50611356e6b6554e3a331f6879fa7116f3959b20e3528783e699"}, + {file = "aiohttp-3.8.4-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:dbf3a08a06b3f433013c143ebd72c15cac33d2914b8ea4bea7ac2c23578815d6"}, + {file = "aiohttp-3.8.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:854f422ac44af92bfe172d8e73229c270dc09b96535e8a548f99c84f82dde241"}, + {file = "aiohttp-3.8.4-cp37-cp37m-win32.whl", hash = "sha256:aeb29c84bb53a84b1a81c6c09d24cf33bb8432cc5c39979021cc0f98c1292a1a"}, + {file = "aiohttp-3.8.4-cp37-cp37m-win_amd64.whl", hash = "sha256:db3fc6120bce9f446d13b1b834ea5b15341ca9ff3f335e4a951a6ead31105480"}, + {file = "aiohttp-3.8.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:fabb87dd8850ef0f7fe2b366d44b77d7e6fa2ea87861ab3844da99291e81e60f"}, + {file = "aiohttp-3.8.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:91f6d540163f90bbaef9387e65f18f73ffd7c79f5225ac3d3f61df7b0d01ad15"}, + {file = "aiohttp-3.8.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:d265f09a75a79a788237d7f9054f929ced2e69eb0bb79de3798c468d8a90f945"}, + {file = "aiohttp-3.8.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3d89efa095ca7d442a6d0cbc755f9e08190ba40069b235c9886a8763b03785da"}, + {file = "aiohttp-3.8.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4dac314662f4e2aa5009977b652d9b8db7121b46c38f2073bfeed9f4049732cd"}, + {file = "aiohttp-3.8.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fe11310ae1e4cd560035598c3f29d86cef39a83d244c7466f95c27ae04850f10"}, + {file = "aiohttp-3.8.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ddb2a2026c3f6a68c3998a6c47ab6795e4127315d2e35a09997da21865757f8"}, + {file = "aiohttp-3.8.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e75b89ac3bd27d2d043b234aa7b734c38ba1b0e43f07787130a0ecac1e12228a"}, + {file = "aiohttp-3.8.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6e601588f2b502c93c30cd5a45bfc665faaf37bbe835b7cfd461753068232074"}, + {file = "aiohttp-3.8.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a5d794d1ae64e7753e405ba58e08fcfa73e3fad93ef9b7e31112ef3c9a0efb52"}, + {file = "aiohttp-3.8.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:a1f4689c9a1462f3df0a1f7e797791cd6b124ddbee2b570d34e7f38ade0e2c71"}, + {file = 
"aiohttp-3.8.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:3032dcb1c35bc330134a5b8a5d4f68c1a87252dfc6e1262c65a7e30e62298275"}, + {file = "aiohttp-3.8.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8189c56eb0ddbb95bfadb8f60ea1b22fcfa659396ea36f6adcc521213cd7b44d"}, + {file = "aiohttp-3.8.4-cp38-cp38-win32.whl", hash = "sha256:33587f26dcee66efb2fff3c177547bd0449ab7edf1b73a7f5dea1e38609a0c54"}, + {file = "aiohttp-3.8.4-cp38-cp38-win_amd64.whl", hash = "sha256:e595432ac259af2d4630008bf638873d69346372d38255774c0e286951e8b79f"}, + {file = "aiohttp-3.8.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5a7bdf9e57126dc345b683c3632e8ba317c31d2a41acd5800c10640387d193ed"}, + {file = "aiohttp-3.8.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:22f6eab15b6db242499a16de87939a342f5a950ad0abaf1532038e2ce7d31567"}, + {file = "aiohttp-3.8.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7235604476a76ef249bd64cb8274ed24ccf6995c4a8b51a237005ee7a57e8643"}, + {file = "aiohttp-3.8.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea9eb976ffdd79d0e893869cfe179a8f60f152d42cb64622fca418cd9b18dc2a"}, + {file = "aiohttp-3.8.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:92c0cea74a2a81c4c76b62ea1cac163ecb20fb3ba3a75c909b9fa71b4ad493cf"}, + {file = "aiohttp-3.8.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:493f5bc2f8307286b7799c6d899d388bbaa7dfa6c4caf4f97ef7521b9cb13719"}, + {file = "aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a63f03189a6fa7c900226e3ef5ba4d3bd047e18f445e69adbd65af433add5a2"}, + {file = "aiohttp-3.8.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:10c8cefcff98fd9168cdd86c4da8b84baaa90bf2da2269c6161984e6737bf23e"}, + {file = "aiohttp-3.8.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:bca5f24726e2919de94f047739d0a4fc01372801a3672708260546aa2601bf57"}, + {file = "aiohttp-3.8.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:03baa76b730e4e15a45f81dfe29a8d910314143414e528737f8589ec60cf7391"}, + {file = "aiohttp-3.8.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:8c29c77cc57e40f84acef9bfb904373a4e89a4e8b74e71aa8075c021ec9078c2"}, + {file = "aiohttp-3.8.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:03543dcf98a6619254b409be2d22b51f21ec66272be4ebda7b04e6412e4b2e14"}, + {file = "aiohttp-3.8.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:17b79c2963db82086229012cff93ea55196ed31f6493bb1ccd2c62f1724324e4"}, + {file = "aiohttp-3.8.4-cp39-cp39-win32.whl", hash = "sha256:34ce9f93a4a68d1272d26030655dd1b58ff727b3ed2a33d80ec433561b03d67a"}, + {file = "aiohttp-3.8.4-cp39-cp39-win_amd64.whl", hash = "sha256:41a86a69bb63bb2fc3dc9ad5ea9f10f1c9c8e282b471931be0268ddd09430b04"}, + {file = "aiohttp-3.8.4.tar.gz", hash = "sha256:bf2e1a9162c1e441bf805a1fd166e249d574ca04e03b34f97e2928769e91ab5c"}, +] + +[package.dependencies] +aiosignal = ">=1.1.2" +async-timeout = ">=4.0.0a3,<5.0" +attrs = ">=17.3.0" +charset-normalizer = ">=2.0,<4.0" +frozenlist = ">=1.1.1" +multidict = ">=4.5,<7.0" +yarl = ">=1.0,<2.0" + +[package.extras] +speedups = ["Brotli", "aiodns", "cchardet"] + +[[package]] +name = "aiosignal" +version = "1.3.1" +description = "aiosignal: a list of registered asynchronous callbacks" +optional = false +python-versions = ">=3.7" +files = [ + {file = "aiosignal-1.3.1-py3-none-any.whl", hash = "sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17"}, + {file = 
"aiosignal-1.3.1.tar.gz", hash = "sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc"}, +] + +[package.dependencies] +frozenlist = ">=1.1.0" + +[[package]] +name = "async-timeout" +version = "4.0.2" +description = "Timeout context manager for asyncio programs" +optional = false +python-versions = ">=3.6" +files = [ + {file = "async-timeout-4.0.2.tar.gz", hash = "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15"}, + {file = "async_timeout-4.0.2-py3-none-any.whl", hash = "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"}, +] + +[[package]] +name = "attrs" +version = "23.1.0" +description = "Classes Without Boilerplate" +optional = false +python-versions = ">=3.7" +files = [ + {file = "attrs-23.1.0-py3-none-any.whl", hash = "sha256:1f28b4522cdc2fb4256ac1a020c78acf9cba2c6b461ccd2c126f3aa8e8335d04"}, + {file = "attrs-23.1.0.tar.gz", hash = "sha256:6279836d581513a26f1bf235f9acd333bc9115683f14f7e8fae46c98fc50e015"}, +] + +[package.extras] +cov = ["attrs[tests]", "coverage[toml] (>=5.3)"] +dev = ["attrs[docs,tests]", "pre-commit"] +docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier", "zope-interface"] +tests = ["attrs[tests-no-zope]", "zope-interface"] +tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] + +[[package]] +name = "certifi" +version = "2023.5.7" +description = "Python package for providing Mozilla's CA Bundle." +optional = false +python-versions = ">=3.6" +files = [ + {file = "certifi-2023.5.7-py3-none-any.whl", hash = "sha256:c6c2e98f5c7869efca1f8916fed228dd91539f9f1b444c314c06eef02980c716"}, + {file = "certifi-2023.5.7.tar.gz", hash = "sha256:0f0d56dc5a6ad56fd4ba36484d6cc34451e1c6548c61daad8c320169f91eddc7"}, +] + +[[package]] +name = "charset-normalizer" +version = "3.1.0" +description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
+optional = false +python-versions = ">=3.7.0" +files = [ + {file = "charset-normalizer-3.1.0.tar.gz", hash = "sha256:34e0a2f9c370eb95597aae63bf85eb5e96826d81e3dcf88b8886012906f509b5"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e0ac8959c929593fee38da1c2b64ee9778733cdf03c482c9ff1d508b6b593b2b"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d7fc3fca01da18fbabe4625d64bb612b533533ed10045a2ac3dd194bfa656b60"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:04eefcee095f58eaabe6dc3cc2262f3bcd776d2c67005880894f447b3f2cb9c1"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20064ead0717cf9a73a6d1e779b23d149b53daf971169289ed2ed43a71e8d3b0"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1435ae15108b1cb6fffbcea2af3d468683b7afed0169ad718451f8db5d1aff6f"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c84132a54c750fda57729d1e2599bb598f5fa0344085dbde5003ba429a4798c0"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75f2568b4189dda1c567339b48cba4ac7384accb9c2a7ed655cd86b04055c795"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:11d3bcb7be35e7b1bba2c23beedac81ee893ac9871d0ba79effc7fc01167db6c"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:891cf9b48776b5c61c700b55a598621fdb7b1e301a550365571e9624f270c203"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:5f008525e02908b20e04707a4f704cd286d94718f48bb33edddc7d7b584dddc1"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:b06f0d3bf045158d2fb8837c5785fe9ff9b8c93358be64461a1089f5da983137"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:49919f8400b5e49e961f320c735388ee686a62327e773fa5b3ce6721f7e785ce"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:22908891a380d50738e1f978667536f6c6b526a2064156203d418f4856d6e86a"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-win32.whl", hash = "sha256:12d1a39aa6b8c6f6248bb54550efcc1c38ce0d8096a146638fd4738e42284448"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:65ed923f84a6844de5fd29726b888e58c62820e0769b76565480e1fdc3d062f8"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9a3267620866c9d17b959a84dd0bd2d45719b817245e49371ead79ed4f710d19"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6734e606355834f13445b6adc38b53c0fd45f1a56a9ba06c2058f86893ae8017"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f8303414c7b03f794347ad062c0516cee0e15f7a612abd0ce1e25caf6ceb47df"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aaf53a6cebad0eae578f062c7d462155eada9c172bd8c4d250b8c1d8eb7f916a"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3dc5b6a8ecfdc5748a7e429782598e4f17ef378e3e272eeb1340ea57c9109f41"}, + {file = 
"charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e1b25e3ad6c909f398df8921780d6a3d120d8c09466720226fc621605b6f92b1"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ca564606d2caafb0abe6d1b5311c2649e8071eb241b2d64e75a0d0065107e62"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b82fab78e0b1329e183a65260581de4375f619167478dddab510c6c6fb04d9b6"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:bd7163182133c0c7701b25e604cf1611c0d87712e56e88e7ee5d72deab3e76b5"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:11d117e6c63e8f495412d37e7dc2e2fff09c34b2d09dbe2bee3c6229577818be"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:cf6511efa4801b9b38dc5546d7547d5b5c6ef4b081c60b23e4d941d0eba9cbeb"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:abc1185d79f47c0a7aaf7e2412a0eb2c03b724581139193d2d82b3ad8cbb00ac"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cb7b2ab0188829593b9de646545175547a70d9a6e2b63bf2cd87a0a391599324"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-win32.whl", hash = "sha256:c36bcbc0d5174a80d6cccf43a0ecaca44e81d25be4b7f90f0ed7bcfbb5a00909"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:cca4def576f47a09a943666b8f829606bcb17e2bc2d5911a46c8f8da45f56755"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:0c95f12b74681e9ae127728f7e5409cbbef9cd914d5896ef238cc779b8152373"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fca62a8301b605b954ad2e9c3666f9d97f63872aa4efcae5492baca2056b74ab"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ac0aa6cd53ab9a31d397f8303f92c42f534693528fafbdb997c82bae6e477ad9"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c3af8e0f07399d3176b179f2e2634c3ce9c1301379a6b8c9c9aeecd481da494f"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a5fc78f9e3f501a1614a98f7c54d3969f3ad9bba8ba3d9b438c3bc5d047dd28"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:628c985afb2c7d27a4800bfb609e03985aaecb42f955049957814e0491d4006d"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:74db0052d985cf37fa111828d0dd230776ac99c740e1a758ad99094be4f1803d"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:1e8fcdd8f672a1c4fc8d0bd3a2b576b152d2a349782d1eb0f6b8e52e9954731d"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:04afa6387e2b282cf78ff3dbce20f0cc071c12dc8f685bd40960cc68644cfea6"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:dd5653e67b149503c68c4018bf07e42eeed6b4e956b24c00ccdf93ac79cdff84"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d2686f91611f9e17f4548dbf050e75b079bbc2a82be565832bc8ea9047b61c8c"}, + {file = 
"charset_normalizer-3.1.0-cp37-cp37m-win32.whl", hash = "sha256:4155b51ae05ed47199dc5b2a4e62abccb274cee6b01da5b895099b61b1982974"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:322102cdf1ab682ecc7d9b1c5eed4ec59657a65e1c146a0da342b78f4112db23"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:e633940f28c1e913615fd624fcdd72fdba807bf53ea6925d6a588e84e1151531"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:3a06f32c9634a8705f4ca9946d667609f52cf130d5548881401f1eb2c39b1e2c"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7381c66e0561c5757ffe616af869b916c8b4e42b367ab29fedc98481d1e74e14"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3573d376454d956553c356df45bb824262c397c6e26ce43e8203c4c540ee0acb"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e89df2958e5159b811af9ff0f92614dabf4ff617c03a4c1c6ff53bf1c399e0e1"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:78cacd03e79d009d95635e7d6ff12c21eb89b894c354bd2b2ed0b4763373693b"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de5695a6f1d8340b12a5d6d4484290ee74d61e467c39ff03b39e30df62cf83a0"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1c60b9c202d00052183c9be85e5eaf18a4ada0a47d188a83c8f5c5b23252f649"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:f645caaf0008bacf349875a974220f1f1da349c5dbe7c4ec93048cdc785a3326"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ea9f9c6034ea2d93d9147818f17c2a0860d41b71c38b9ce4d55f21b6f9165a11"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:80d1543d58bd3d6c271b66abf454d437a438dff01c3e62fdbcd68f2a11310d4b"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:73dc03a6a7e30b7edc5b01b601e53e7fc924b04e1835e8e407c12c037e81adbd"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6f5c2e7bc8a4bf7c426599765b1bd33217ec84023033672c1e9a8b35eaeaaaf8"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-win32.whl", hash = "sha256:12a2b561af122e3d94cdb97fe6fb2bb2b82cef0cdca131646fdb940a1eda04f0"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:3160a0fd9754aab7d47f95a6b63ab355388d890163eb03b2d2b87ab0a30cfa59"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:38e812a197bf8e71a59fe55b757a84c1f946d0ac114acafaafaf21667a7e169e"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6baf0baf0d5d265fa7944feb9f7451cc316bfe30e8df1a61b1bb08577c554f31"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8f25e17ab3039b05f762b0a55ae0b3632b2e073d9c8fc88e89aca31a6198e88f"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3747443b6a904001473370d7810aa19c3a180ccd52a7157aacc264a5ac79265e"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:b116502087ce8a6b7a5f1814568ccbd0e9f6cfd99948aa59b0e241dc57cf739f"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d16fd5252f883eb074ca55cb622bc0bee49b979ae4e8639fff6ca3ff44f9f854"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21fa558996782fc226b529fdd2ed7866c2c6ec91cee82735c98a197fae39f706"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6f6c7a8a57e9405cad7485f4c9d3172ae486cfef1344b5ddd8e5239582d7355e"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:ac3775e3311661d4adace3697a52ac0bab17edd166087d493b52d4f4f553f9f0"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:10c93628d7497c81686e8e5e557aafa78f230cd9e77dd0c40032ef90c18f2230"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:6f4f4668e1831850ebcc2fd0b1cd11721947b6dc7c00bf1c6bd3c929ae14f2c7"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:0be65ccf618c1e7ac9b849c315cc2e8a8751d9cfdaa43027d4f6624bd587ab7e"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:53d0a3fa5f8af98a1e261de6a3943ca631c526635eb5817a87a59d9a57ebf48f"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-win32.whl", hash = "sha256:a04f86f41a8916fe45ac5024ec477f41f886b3c435da2d4e3d2709b22ab02af1"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:830d2948a5ec37c386d3170c483063798d7879037492540f10a475e3fd6f244b"}, + {file = "charset_normalizer-3.1.0-py3-none-any.whl", hash = "sha256:3d9098b479e78c85080c98e1e35ff40b4a31d8953102bb0fd7d1b6f8a2111a3d"}, +] + +[[package]] +name = "click" +version = "8.1.3" +description = "Composable command line interface toolkit" +optional = false +python-versions = ">=3.7" +files = [ + {file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"}, + {file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + [[package]] name = "colorama" version = "0.4.6" @@ -25,6 +285,100 @@ files = [ [package.extras] test = ["pytest (>=6)"] +[[package]] +name = "frozenlist" +version = "1.3.3" +description = "A list-like structure which implements collections.abc.MutableSequence" +optional = false +python-versions = ">=3.7" +files = [ + {file = "frozenlist-1.3.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ff8bf625fe85e119553b5383ba0fb6aa3d0ec2ae980295aaefa552374926b3f4"}, + {file = "frozenlist-1.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:dfbac4c2dfcc082fcf8d942d1e49b6aa0766c19d3358bd86e2000bf0fa4a9cf0"}, + {file = "frozenlist-1.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b1c63e8d377d039ac769cd0926558bb7068a1f7abb0f003e3717ee003ad85530"}, + {file = "frozenlist-1.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7fdfc24dcfce5b48109867c13b4cb15e4660e7bd7661741a391f821f23dfdca7"}, + {file = "frozenlist-1.3.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2c926450857408e42f0bbc295e84395722ce74bae69a3b2aa2a65fe22cb14b99"}, + {file = "frozenlist-1.3.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", 
hash = "sha256:1841e200fdafc3d51f974d9d377c079a0694a8f06de2e67b48150328d66d5483"}, + {file = "frozenlist-1.3.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f470c92737afa7d4c3aacc001e335062d582053d4dbe73cda126f2d7031068dd"}, + {file = "frozenlist-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:783263a4eaad7c49983fe4b2e7b53fa9770c136c270d2d4bbb6d2192bf4d9caf"}, + {file = "frozenlist-1.3.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:924620eef691990dfb56dc4709f280f40baee568c794b5c1885800c3ecc69816"}, + {file = "frozenlist-1.3.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ae4dc05c465a08a866b7a1baf360747078b362e6a6dbeb0c57f234db0ef88ae0"}, + {file = "frozenlist-1.3.3-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:bed331fe18f58d844d39ceb398b77d6ac0b010d571cba8267c2e7165806b00ce"}, + {file = "frozenlist-1.3.3-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:02c9ac843e3390826a265e331105efeab489ffaf4dd86384595ee8ce6d35ae7f"}, + {file = "frozenlist-1.3.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9545a33965d0d377b0bc823dcabf26980e77f1b6a7caa368a365a9497fb09420"}, + {file = "frozenlist-1.3.3-cp310-cp310-win32.whl", hash = "sha256:d5cd3ab21acbdb414bb6c31958d7b06b85eeb40f66463c264a9b343a4e238642"}, + {file = "frozenlist-1.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:b756072364347cb6aa5b60f9bc18e94b2f79632de3b0190253ad770c5df17db1"}, + {file = "frozenlist-1.3.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:b4395e2f8d83fbe0c627b2b696acce67868793d7d9750e90e39592b3626691b7"}, + {file = "frozenlist-1.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:14143ae966a6229350021384870458e4777d1eae4c28d1a7aa47f24d030e6678"}, + {file = "frozenlist-1.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5d8860749e813a6f65bad8285a0520607c9500caa23fea6ee407e63debcdbef6"}, + {file = "frozenlist-1.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23d16d9f477bb55b6154654e0e74557040575d9d19fe78a161bd33d7d76808e8"}, + {file = "frozenlist-1.3.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:eb82dbba47a8318e75f679690190c10a5e1f447fbf9df41cbc4c3afd726d88cb"}, + {file = "frozenlist-1.3.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9309869032abb23d196cb4e4db574232abe8b8be1339026f489eeb34a4acfd91"}, + {file = "frozenlist-1.3.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a97b4fe50b5890d36300820abd305694cb865ddb7885049587a5678215782a6b"}, + {file = "frozenlist-1.3.3-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c188512b43542b1e91cadc3c6c915a82a5eb95929134faf7fd109f14f9892ce4"}, + {file = "frozenlist-1.3.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:303e04d422e9b911a09ad499b0368dc551e8c3cd15293c99160c7f1f07b59a48"}, + {file = "frozenlist-1.3.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:0771aed7f596c7d73444c847a1c16288937ef988dc04fb9f7be4b2aa91db609d"}, + {file = "frozenlist-1.3.3-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:66080ec69883597e4d026f2f71a231a1ee9887835902dbe6b6467d5a89216cf6"}, + {file = "frozenlist-1.3.3-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:41fe21dc74ad3a779c3d73a2786bdf622ea81234bdd4faf90b8b03cad0c2c0b4"}, + {file = 
"frozenlist-1.3.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f20380df709d91525e4bee04746ba612a4df0972c1b8f8e1e8af997e678c7b81"}, + {file = "frozenlist-1.3.3-cp311-cp311-win32.whl", hash = "sha256:f30f1928162e189091cf4d9da2eac617bfe78ef907a761614ff577ef4edfb3c8"}, + {file = "frozenlist-1.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:a6394d7dadd3cfe3f4b3b186e54d5d8504d44f2d58dcc89d693698e8b7132b32"}, + {file = "frozenlist-1.3.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8df3de3a9ab8325f94f646609a66cbeeede263910c5c0de0101079ad541af332"}, + {file = "frozenlist-1.3.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0693c609e9742c66ba4870bcee1ad5ff35462d5ffec18710b4ac89337ff16e27"}, + {file = "frozenlist-1.3.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cd4210baef299717db0a600d7a3cac81d46ef0e007f88c9335db79f8979c0d3d"}, + {file = "frozenlist-1.3.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:394c9c242113bfb4b9aa36e2b80a05ffa163a30691c7b5a29eba82e937895d5e"}, + {file = "frozenlist-1.3.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6327eb8e419f7d9c38f333cde41b9ae348bec26d840927332f17e887a8dcb70d"}, + {file = "frozenlist-1.3.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e24900aa13212e75e5b366cb9065e78bbf3893d4baab6052d1aca10d46d944c"}, + {file = "frozenlist-1.3.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:3843f84a6c465a36559161e6c59dce2f2ac10943040c2fd021cfb70d58c4ad56"}, + {file = "frozenlist-1.3.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:84610c1502b2461255b4c9b7d5e9c48052601a8957cd0aea6ec7a7a1e1fb9420"}, + {file = "frozenlist-1.3.3-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:c21b9aa40e08e4f63a2f92ff3748e6b6c84d717d033c7b3438dd3123ee18f70e"}, + {file = "frozenlist-1.3.3-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:efce6ae830831ab6a22b9b4091d411698145cb9b8fc869e1397ccf4b4b6455cb"}, + {file = "frozenlist-1.3.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:40de71985e9042ca00b7953c4f41eabc3dc514a2d1ff534027f091bc74416401"}, + {file = "frozenlist-1.3.3-cp37-cp37m-win32.whl", hash = "sha256:180c00c66bde6146a860cbb81b54ee0df350d2daf13ca85b275123bbf85de18a"}, + {file = "frozenlist-1.3.3-cp37-cp37m-win_amd64.whl", hash = "sha256:9bbbcedd75acdfecf2159663b87f1bb5cfc80e7cd99f7ddd9d66eb98b14a8411"}, + {file = "frozenlist-1.3.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:034a5c08d36649591be1cbb10e09da9f531034acfe29275fc5454a3b101ce41a"}, + {file = "frozenlist-1.3.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ba64dc2b3b7b158c6660d49cdb1d872d1d0bf4e42043ad8d5006099479a194e5"}, + {file = "frozenlist-1.3.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:47df36a9fe24054b950bbc2db630d508cca3aa27ed0566c0baf661225e52c18e"}, + {file = "frozenlist-1.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:008a054b75d77c995ea26629ab3a0c0d7281341f2fa7e1e85fa6153ae29ae99c"}, + {file = "frozenlist-1.3.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:841ea19b43d438a80b4de62ac6ab21cfe6827bb8a9dc62b896acc88eaf9cecba"}, + {file = "frozenlist-1.3.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e235688f42b36be2b6b06fc37ac2126a73b75fb8d6bc66dd632aa35286238703"}, + {file = 
"frozenlist-1.3.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca713d4af15bae6e5d79b15c10c8522859a9a89d3b361a50b817c98c2fb402a2"}, + {file = "frozenlist-1.3.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ac5995f2b408017b0be26d4a1d7c61bce106ff3d9e3324374d66b5964325448"}, + {file = "frozenlist-1.3.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:a4ae8135b11652b08a8baf07631d3ebfe65a4c87909dbef5fa0cdde440444ee4"}, + {file = "frozenlist-1.3.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4ea42116ceb6bb16dbb7d526e242cb6747b08b7710d9782aa3d6732bd8d27649"}, + {file = "frozenlist-1.3.3-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:810860bb4bdce7557bc0febb84bbd88198b9dbc2022d8eebe5b3590b2ad6c842"}, + {file = "frozenlist-1.3.3-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:ee78feb9d293c323b59a6f2dd441b63339a30edf35abcb51187d2fc26e696d13"}, + {file = "frozenlist-1.3.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:0af2e7c87d35b38732e810befb9d797a99279cbb85374d42ea61c1e9d23094b3"}, + {file = "frozenlist-1.3.3-cp38-cp38-win32.whl", hash = "sha256:899c5e1928eec13fd6f6d8dc51be23f0d09c5281e40d9cf4273d188d9feeaf9b"}, + {file = "frozenlist-1.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:7f44e24fa70f6fbc74aeec3e971f60a14dde85da364aa87f15d1be94ae75aeef"}, + {file = "frozenlist-1.3.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:2b07ae0c1edaa0a36339ec6cce700f51b14a3fc6545fdd32930d2c83917332cf"}, + {file = "frozenlist-1.3.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ebb86518203e12e96af765ee89034a1dbb0c3c65052d1b0c19bbbd6af8a145e1"}, + {file = "frozenlist-1.3.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5cf820485f1b4c91e0417ea0afd41ce5cf5965011b3c22c400f6d144296ccbc0"}, + {file = "frozenlist-1.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c11e43016b9024240212d2a65043b70ed8dfd3b52678a1271972702d990ac6d"}, + {file = "frozenlist-1.3.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8fa3c6e3305aa1146b59a09b32b2e04074945ffcfb2f0931836d103a2c38f936"}, + {file = "frozenlist-1.3.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:352bd4c8c72d508778cf05ab491f6ef36149f4d0cb3c56b1b4302852255d05d5"}, + {file = "frozenlist-1.3.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:65a5e4d3aa679610ac6e3569e865425b23b372277f89b5ef06cf2cdaf1ebf22b"}, + {file = "frozenlist-1.3.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1e2c1185858d7e10ff045c496bbf90ae752c28b365fef2c09cf0fa309291669"}, + {file = "frozenlist-1.3.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f163d2fd041c630fed01bc48d28c3ed4a3b003c00acd396900e11ee5316b56bb"}, + {file = "frozenlist-1.3.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:05cdb16d09a0832eedf770cb7bd1fe57d8cf4eaf5aced29c4e41e3f20b30a784"}, + {file = "frozenlist-1.3.3-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:8bae29d60768bfa8fb92244b74502b18fae55a80eac13c88eb0b496d4268fd2d"}, + {file = "frozenlist-1.3.3-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:eedab4c310c0299961ac285591acd53dc6723a1ebd90a57207c71f6e0c2153ab"}, + {file = "frozenlist-1.3.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:3bbdf44855ed8f0fbcd102ef05ec3012d6a4fd7c7562403f76ce6a52aeffb2b1"}, + {file = 
"frozenlist-1.3.3-cp39-cp39-win32.whl", hash = "sha256:efa568b885bca461f7c7b9e032655c0c143d305bf01c30caf6db2854a4532b38"}, + {file = "frozenlist-1.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:cfe33efc9cb900a4c46f91a5ceba26d6df370ffddd9ca386eb1d4f0ad97b9ea9"}, + {file = "frozenlist-1.3.3.tar.gz", hash = "sha256:58bcc55721e8a90b88332d6cd441261ebb22342e238296bb330968952fbb3a6a"}, +] + +[[package]] +name = "idna" +version = "3.4" +description = "Internationalized Domain Names in Applications (IDNA)" +optional = false +python-versions = ">=3.5" +files = [ + {file = "idna-3.4-py3-none-any.whl", hash = "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"}, + {file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"}, +] + [[package]] name = "iniconfig" version = "2.0.0" @@ -36,6 +390,111 @@ files = [ {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, ] +[[package]] +name = "multidict" +version = "6.0.4" +description = "multidict implementation" +optional = false +python-versions = ">=3.7" +files = [ + {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0b1a97283e0c85772d613878028fec909f003993e1007eafa715b24b377cb9b8"}, + {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:eeb6dcc05e911516ae3d1f207d4b0520d07f54484c49dfc294d6e7d63b734171"}, + {file = "multidict-6.0.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d6d635d5209b82a3492508cf5b365f3446afb65ae7ebd755e70e18f287b0adf7"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c048099e4c9e9d615545e2001d3d8a4380bd403e1a0578734e0d31703d1b0c0b"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ea20853c6dbbb53ed34cb4d080382169b6f4554d394015f1bef35e881bf83547"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:16d232d4e5396c2efbbf4f6d4df89bfa905eb0d4dc5b3549d872ab898451f569"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36c63aaa167f6c6b04ef2c85704e93af16c11d20de1d133e39de6a0e84582a93"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:64bdf1086b6043bf519869678f5f2757f473dee970d7abf6da91ec00acb9cb98"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:43644e38f42e3af682690876cff722d301ac585c5b9e1eacc013b7a3f7b696a0"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7582a1d1030e15422262de9f58711774e02fa80df0d1578995c76214f6954988"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:ddff9c4e225a63a5afab9dd15590432c22e8057e1a9a13d28ed128ecf047bbdc"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:ee2a1ece51b9b9e7752e742cfb661d2a29e7bcdba2d27e66e28a99f1890e4fa0"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a2e4369eb3d47d2034032a26c7a80fcb21a2cb22e1173d761a162f11e562caa5"}, + {file = "multidict-6.0.4-cp310-cp310-win32.whl", hash = "sha256:574b7eae1ab267e5f8285f0fe881f17efe4b98c39a40858247720935b893bba8"}, + {file = "multidict-6.0.4-cp310-cp310-win_amd64.whl", hash = "sha256:4dcbb0906e38440fa3e325df2359ac6cb043df8e58c965bb45f4e406ecb162cc"}, + {file = 
"multidict-6.0.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0dfad7a5a1e39c53ed00d2dd0c2e36aed4650936dc18fd9a1826a5ae1cad6f03"}, + {file = "multidict-6.0.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:64da238a09d6039e3bd39bb3aee9c21a5e34f28bfa5aa22518581f910ff94af3"}, + {file = "multidict-6.0.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ff959bee35038c4624250473988b24f846cbeb2c6639de3602c073f10410ceba"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01a3a55bd90018c9c080fbb0b9f4891db37d148a0a18722b42f94694f8b6d4c9"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c5cb09abb18c1ea940fb99360ea0396f34d46566f157122c92dfa069d3e0e982"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:666daae833559deb2d609afa4490b85830ab0dfca811a98b70a205621a6109fe"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11bdf3f5e1518b24530b8241529d2050014c884cf18b6fc69c0c2b30ca248710"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d18748f2d30f94f498e852c67d61261c643b349b9d2a581131725595c45ec6c"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:458f37be2d9e4c95e2d8866a851663cbc76e865b78395090786f6cd9b3bbf4f4"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b1a2eeedcead3a41694130495593a559a668f382eee0727352b9a41e1c45759a"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:7d6ae9d593ef8641544d6263c7fa6408cc90370c8cb2bbb65f8d43e5b0351d9c"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:5979b5632c3e3534e42ca6ff856bb24b2e3071b37861c2c727ce220d80eee9ed"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:dcfe792765fab89c365123c81046ad4103fcabbc4f56d1c1997e6715e8015461"}, + {file = "multidict-6.0.4-cp311-cp311-win32.whl", hash = "sha256:3601a3cece3819534b11d4efc1eb76047488fddd0c85a3948099d5da4d504636"}, + {file = "multidict-6.0.4-cp311-cp311-win_amd64.whl", hash = "sha256:81a4f0b34bd92df3da93315c6a59034df95866014ac08535fc819f043bfd51f0"}, + {file = "multidict-6.0.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:67040058f37a2a51ed8ea8f6b0e6ee5bd78ca67f169ce6122f3e2ec80dfe9b78"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:853888594621e6604c978ce2a0444a1e6e70c8d253ab65ba11657659dcc9100f"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:39ff62e7d0f26c248b15e364517a72932a611a9b75f35b45be078d81bdb86603"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:af048912e045a2dc732847d33821a9d84ba553f5c5f028adbd364dd4765092ac"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1e8b901e607795ec06c9e42530788c45ac21ef3aaa11dbd0c69de543bfb79a9"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:62501642008a8b9871ddfccbf83e4222cf8ac0d5aeedf73da36153ef2ec222d2"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:99b76c052e9f1bc0721f7541e5e8c05db3941eb9ebe7b8553c625ef88d6eefde"}, + {file = 
"multidict-6.0.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:509eac6cf09c794aa27bcacfd4d62c885cce62bef7b2c3e8b2e49d365b5003fe"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:21a12c4eb6ddc9952c415f24eef97e3e55ba3af61f67c7bc388dcdec1404a067"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:5cad9430ab3e2e4fa4a2ef4450f548768400a2ac635841bc2a56a2052cdbeb87"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ab55edc2e84460694295f401215f4a58597f8f7c9466faec545093045476327d"}, + {file = "multidict-6.0.4-cp37-cp37m-win32.whl", hash = "sha256:5a4dcf02b908c3b8b17a45fb0f15b695bf117a67b76b7ad18b73cf8e92608775"}, + {file = "multidict-6.0.4-cp37-cp37m-win_amd64.whl", hash = "sha256:6ed5f161328b7df384d71b07317f4d8656434e34591f20552c7bcef27b0ab88e"}, + {file = "multidict-6.0.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5fc1b16f586f049820c5c5b17bb4ee7583092fa0d1c4e28b5239181ff9532e0c"}, + {file = "multidict-6.0.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1502e24330eb681bdaa3eb70d6358e818e8e8f908a22a1851dfd4e15bc2f8161"}, + {file = "multidict-6.0.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b692f419760c0e65d060959df05f2a531945af31fda0c8a3b3195d4efd06de11"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45e1ecb0379bfaab5eef059f50115b54571acfbe422a14f668fc8c27ba410e7e"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ddd3915998d93fbcd2566ddf9cf62cdb35c9e093075f862935573d265cf8f65d"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:59d43b61c59d82f2effb39a93c48b845efe23a3852d201ed2d24ba830d0b4cf2"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc8e1d0c705233c5dd0c5e6460fbad7827d5d36f310a0fadfd45cc3029762258"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6aa0418fcc838522256761b3415822626f866758ee0bc6632c9486b179d0b52"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6748717bb10339c4760c1e63da040f5f29f5ed6e59d76daee30305894069a660"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4d1a3d7ef5e96b1c9e92f973e43aa5e5b96c659c9bc3124acbbd81b0b9c8a951"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4372381634485bec7e46718edc71528024fcdc6f835baefe517b34a33c731d60"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:fc35cb4676846ef752816d5be2193a1e8367b4c1397b74a565a9d0389c433a1d"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:4b9d9e4e2b37daddb5c23ea33a3417901fa7c7b3dee2d855f63ee67a0b21e5b1"}, + {file = "multidict-6.0.4-cp38-cp38-win32.whl", hash = "sha256:e41b7e2b59679edfa309e8db64fdf22399eec4b0b24694e1b2104fb789207779"}, + {file = "multidict-6.0.4-cp38-cp38-win_amd64.whl", hash = "sha256:d6c254ba6e45d8e72739281ebc46ea5eb5f101234f3ce171f0e9f5cc86991480"}, + {file = "multidict-6.0.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:16ab77bbeb596e14212e7bab8429f24c1579234a3a462105cda4a66904998664"}, + {file = "multidict-6.0.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc779e9e6f7fda81b3f9aa58e3a6091d49ad528b11ed19f6621408806204ad35"}, + {file = "multidict-6.0.4-cp39-cp39-macosx_11_0_arm64.whl", hash = 
"sha256:4ceef517eca3e03c1cceb22030a3e39cb399ac86bff4e426d4fc6ae49052cc60"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:281af09f488903fde97923c7744bb001a9b23b039a909460d0f14edc7bf59706"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:52f2dffc8acaba9a2f27174c41c9e57f60b907bb9f096b36b1a1f3be71c6284d"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b41156839806aecb3641f3208c0dafd3ac7775b9c4c422d82ee2a45c34ba81ca"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5e3fc56f88cc98ef8139255cf8cd63eb2c586531e43310ff859d6bb3a6b51f1"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8316a77808c501004802f9beebde51c9f857054a0c871bd6da8280e718444449"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f70b98cd94886b49d91170ef23ec5c0e8ebb6f242d734ed7ed677b24d50c82cf"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bf6774e60d67a9efe02b3616fee22441d86fab4c6d335f9d2051d19d90a40063"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:e69924bfcdda39b722ef4d9aa762b2dd38e4632b3641b1d9a57ca9cd18f2f83a"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:6b181d8c23da913d4ff585afd1155a0e1194c0b50c54fcfe286f70cdaf2b7176"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:52509b5be062d9eafc8170e53026fbc54cf3b32759a23d07fd935fb04fc22d95"}, + {file = "multidict-6.0.4-cp39-cp39-win32.whl", hash = "sha256:27c523fbfbdfd19c6867af7346332b62b586eed663887392cff78d614f9ec313"}, + {file = "multidict-6.0.4-cp39-cp39-win_amd64.whl", hash = "sha256:33029f5734336aa0d4c0384525da0387ef89148dc7191aae00ca5fb23d7aafc2"}, + {file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"}, +] + +[[package]] +name = "openai" +version = "0.27.8" +description = "Python client library for the OpenAI API" +optional = false +python-versions = ">=3.7.1" +files = [ + {file = "openai-0.27.8-py3-none-any.whl", hash = "sha256:e0a7c2f7da26bdbe5354b03c6d4b82a2f34bd4458c7a17ae1a7092c3e397e03c"}, + {file = "openai-0.27.8.tar.gz", hash = "sha256:2483095c7db1eee274cebac79e315a986c4e55207bb4fa7b82d185b3a2ed9536"}, +] + +[package.dependencies] +aiohttp = "*" +requests = ">=2.20" +tqdm = "*" + +[package.extras] +datalib = ["numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] +dev = ["black (>=21.6b0,<22.0)", "pytest (==6.*)", "pytest-asyncio", "pytest-mock"] +embeddings = ["matplotlib", "numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)", "plotly", "scikit-learn (>=1.0.2)", "scipy", "tenacity (>=8.0.1)"] +wandb = ["numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)", "wandb"] + [[package]] name = "packaging" version = "23.1" @@ -62,6 +521,58 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "pydantic" +version = "1.10.9" +description = "Data validation and settings management using python type hints" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pydantic-1.10.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e692dec4a40bfb40ca530e07805b1208c1de071a18d26af4a2a0d79015b352ca"}, + {file = 
"pydantic-1.10.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3c52eb595db83e189419bf337b59154bdcca642ee4b2a09e5d7797e41ace783f"}, + {file = "pydantic-1.10.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:939328fd539b8d0edf244327398a667b6b140afd3bf7e347cf9813c736211896"}, + {file = "pydantic-1.10.9-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b48d3d634bca23b172f47f2335c617d3fcb4b3ba18481c96b7943a4c634f5c8d"}, + {file = "pydantic-1.10.9-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:f0b7628fb8efe60fe66fd4adadd7ad2304014770cdc1f4934db41fe46cc8825f"}, + {file = "pydantic-1.10.9-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:e1aa5c2410769ca28aa9a7841b80d9d9a1c5f223928ca8bec7e7c9a34d26b1d4"}, + {file = "pydantic-1.10.9-cp310-cp310-win_amd64.whl", hash = "sha256:eec39224b2b2e861259d6f3c8b6290d4e0fbdce147adb797484a42278a1a486f"}, + {file = "pydantic-1.10.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d111a21bbbfd85c17248130deac02bbd9b5e20b303338e0dbe0faa78330e37e0"}, + {file = "pydantic-1.10.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e9aec8627a1a6823fc62fb96480abe3eb10168fd0d859ee3d3b395105ae19a7"}, + {file = "pydantic-1.10.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:07293ab08e7b4d3c9d7de4949a0ea571f11e4557d19ea24dd3ae0c524c0c334d"}, + {file = "pydantic-1.10.9-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7ee829b86ce984261d99ff2fd6e88f2230068d96c2a582f29583ed602ef3fc2c"}, + {file = "pydantic-1.10.9-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4b466a23009ff5cdd7076eb56aca537c745ca491293cc38e72bf1e0e00de5b91"}, + {file = "pydantic-1.10.9-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7847ca62e581e6088d9000f3c497267868ca2fa89432714e21a4fb33a04d52e8"}, + {file = "pydantic-1.10.9-cp311-cp311-win_amd64.whl", hash = "sha256:7845b31959468bc5b78d7b95ec52fe5be32b55d0d09983a877cca6aedc51068f"}, + {file = "pydantic-1.10.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:517a681919bf880ce1dac7e5bc0c3af1e58ba118fd774da2ffcd93c5f96eaece"}, + {file = "pydantic-1.10.9-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67195274fd27780f15c4c372f4ba9a5c02dad6d50647b917b6a92bf00b3d301a"}, + {file = "pydantic-1.10.9-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2196c06484da2b3fded1ab6dbe182bdabeb09f6318b7fdc412609ee2b564c49a"}, + {file = "pydantic-1.10.9-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:6257bb45ad78abacda13f15bde5886efd6bf549dd71085e64b8dcf9919c38b60"}, + {file = "pydantic-1.10.9-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:3283b574b01e8dbc982080d8287c968489d25329a463b29a90d4157de4f2baaf"}, + {file = "pydantic-1.10.9-cp37-cp37m-win_amd64.whl", hash = "sha256:5f8bbaf4013b9a50e8100333cc4e3fa2f81214033e05ac5aa44fa24a98670a29"}, + {file = "pydantic-1.10.9-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b9cd67fb763248cbe38f0593cd8611bfe4b8ad82acb3bdf2b0898c23415a1f82"}, + {file = "pydantic-1.10.9-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f50e1764ce9353be67267e7fd0da08349397c7db17a562ad036aa7c8f4adfdb6"}, + {file = "pydantic-1.10.9-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:73ef93e5e1d3c8e83f1ff2e7fdd026d9e063c7e089394869a6e2985696693766"}, + {file = 
"pydantic-1.10.9-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:128d9453d92e6e81e881dd7e2484e08d8b164da5507f62d06ceecf84bf2e21d3"}, + {file = "pydantic-1.10.9-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ad428e92ab68798d9326bb3e5515bc927444a3d71a93b4a2ca02a8a5d795c572"}, + {file = "pydantic-1.10.9-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:fab81a92f42d6d525dd47ced310b0c3e10c416bbfae5d59523e63ea22f82b31e"}, + {file = "pydantic-1.10.9-cp38-cp38-win_amd64.whl", hash = "sha256:963671eda0b6ba6926d8fc759e3e10335e1dc1b71ff2a43ed2efd6996634dafb"}, + {file = "pydantic-1.10.9-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:970b1bdc6243ef663ba5c7e36ac9ab1f2bfecb8ad297c9824b542d41a750b298"}, + {file = "pydantic-1.10.9-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7e1d5290044f620f80cf1c969c542a5468f3656de47b41aa78100c5baa2b8276"}, + {file = "pydantic-1.10.9-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:83fcff3c7df7adff880622a98022626f4f6dbce6639a88a15a3ce0f96466cb60"}, + {file = "pydantic-1.10.9-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0da48717dc9495d3a8f215e0d012599db6b8092db02acac5e0d58a65248ec5bc"}, + {file = "pydantic-1.10.9-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:0a2aabdc73c2a5960e87c3ffebca6ccde88665616d1fd6d3db3178ef427b267a"}, + {file = "pydantic-1.10.9-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9863b9420d99dfa9c064042304868e8ba08e89081428a1c471858aa2af6f57c4"}, + {file = "pydantic-1.10.9-cp39-cp39-win_amd64.whl", hash = "sha256:e7c9900b43ac14110efa977be3da28931ffc74c27e96ee89fbcaaf0b0fe338e1"}, + {file = "pydantic-1.10.9-py3-none-any.whl", hash = "sha256:6cafde02f6699ce4ff643417d1a9223716ec25e228ddc3b436fe7e2d25a1f305"}, + {file = "pydantic-1.10.9.tar.gz", hash = "sha256:95c70da2cd3b6ddf3b9645ecaa8d98f3d80c606624b6d245558d202cd23ea3be"}, +] + +[package.dependencies] +typing-extensions = ">=4.2.0" + +[package.extras] +dotenv = ["python-dotenv (>=0.10.4)"] +email = ["email-validator (>=1.0.3)"] + [[package]] name = "pytest" version = "7.3.2" @@ -84,6 +595,27 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +[[package]] +name = "requests" +version = "2.31.0" +description = "Python HTTP for Humans." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"}, + {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"}, +] + +[package.dependencies] +certifi = ">=2017.4.17" +charset-normalizer = ">=2,<4" +idna = ">=2.5,<4" +urllib3 = ">=1.21.1,<3" + +[package.extras] +socks = ["PySocks (>=1.5.6,!=1.5.7)"] +use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] + [[package]] name = "tomli" version = "2.0.1" @@ -95,7 +627,142 @@ files = [ {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ] +[[package]] +name = "tqdm" +version = "4.65.0" +description = "Fast, Extensible Progress Meter" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tqdm-4.65.0-py3-none-any.whl", hash = "sha256:c4f53a17fe37e132815abceec022631be8ffe1b9381c2e6e30aa70edc99e9671"}, + {file = "tqdm-4.65.0.tar.gz", hash = "sha256:1871fb68a86b8fb3b59ca4cdd3dcccbc7e6d613eeed31f4c332531977b89beb5"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[package.extras] +dev = ["py-make (>=0.1.0)", "twine", "wheel"] +notebook = ["ipywidgets (>=6)"] +slack = ["slack-sdk"] +telegram = ["requests"] + +[[package]] +name = "typing-extensions" +version = "4.6.3" +description = "Backported and Experimental Type Hints for Python 3.7+" +optional = false +python-versions = ">=3.7" +files = [ + {file = "typing_extensions-4.6.3-py3-none-any.whl", hash = "sha256:88a4153d8505aabbb4e13aacb7c486c2b4a33ca3b3f807914a9b4c844c471c26"}, + {file = "typing_extensions-4.6.3.tar.gz", hash = "sha256:d91d5919357fe7f681a9f2b5b4cb2a5f1ef0a1e9f59c4d8ff0d3491e05c0ffd5"}, +] + +[[package]] +name = "urllib3" +version = "2.0.3" +description = "HTTP library with thread-safe connection pooling, file post, and more." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "urllib3-2.0.3-py3-none-any.whl", hash = "sha256:48e7fafa40319d358848e1bc6809b208340fafe2096f1725d05d67443d0483d1"}, + {file = "urllib3-2.0.3.tar.gz", hash = "sha256:bee28b5e56addb8226c96f7f13ac28cb4c301dd5ea8a6ca179c0b9835e032825"}, +] + +[package.extras] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] +secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17.1.0)", "urllib3-secure-extra"] +socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] +zstd = ["zstandard (>=0.18.0)"] + +[[package]] +name = "yarl" +version = "1.9.2" +description = "Yet another URL library" +optional = false +python-versions = ">=3.7" +files = [ + {file = "yarl-1.9.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:8c2ad583743d16ddbdf6bb14b5cd76bf43b0d0006e918809d5d4ddf7bde8dd82"}, + {file = "yarl-1.9.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:82aa6264b36c50acfb2424ad5ca537a2060ab6de158a5bd2a72a032cc75b9eb8"}, + {file = "yarl-1.9.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c0c77533b5ed4bcc38e943178ccae29b9bcf48ffd1063f5821192f23a1bd27b9"}, + {file = "yarl-1.9.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ee4afac41415d52d53a9833ebae7e32b344be72835bbb589018c9e938045a560"}, + {file = "yarl-1.9.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9bf345c3a4f5ba7f766430f97f9cc1320786f19584acc7086491f45524a551ac"}, + {file = "yarl-1.9.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2a96c19c52ff442a808c105901d0bdfd2e28575b3d5f82e2f5fd67e20dc5f4ea"}, + {file = "yarl-1.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:891c0e3ec5ec881541f6c5113d8df0315ce5440e244a716b95f2525b7b9f3608"}, + {file = "yarl-1.9.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c3a53ba34a636a256d767c086ceb111358876e1fb6b50dfc4d3f4951d40133d5"}, + {file = "yarl-1.9.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:566185e8ebc0898b11f8026447eacd02e46226716229cea8db37496c8cdd26e0"}, + {file = "yarl-1.9.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:2b0738fb871812722a0ac2154be1f049c6223b9f6f22eec352996b69775b36d4"}, + {file = "yarl-1.9.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:32f1d071b3f362c80f1a7d322bfd7b2d11e33d2adf395cc1dd4df36c9c243095"}, + {file = "yarl-1.9.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:e9fdc7ac0d42bc3ea78818557fab03af6181e076a2944f43c38684b4b6bed8e3"}, + {file = "yarl-1.9.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:56ff08ab5df8429901ebdc5d15941b59f6253393cb5da07b4170beefcf1b2528"}, + {file = "yarl-1.9.2-cp310-cp310-win32.whl", hash = "sha256:8ea48e0a2f931064469bdabca50c2f578b565fc446f302a79ba6cc0ee7f384d3"}, + {file = "yarl-1.9.2-cp310-cp310-win_amd64.whl", hash = "sha256:50f33040f3836e912ed16d212f6cc1efb3231a8a60526a407aeb66c1c1956dde"}, + {file = "yarl-1.9.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:646d663eb2232d7909e6601f1a9107e66f9791f290a1b3dc7057818fe44fc2b6"}, + {file = "yarl-1.9.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:aff634b15beff8902d1f918012fc2a42e0dbae6f469fce134c8a0dc51ca423bb"}, + {file = "yarl-1.9.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a83503934c6273806aed765035716216cc9ab4e0364f7f066227e1aaea90b8d0"}, + {file = "yarl-1.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:b25322201585c69abc7b0e89e72790469f7dad90d26754717f3310bfe30331c2"}, + {file = "yarl-1.9.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:22a94666751778629f1ec4280b08eb11815783c63f52092a5953faf73be24191"}, + {file = "yarl-1.9.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8ec53a0ea2a80c5cd1ab397925f94bff59222aa3cf9c6da938ce05c9ec20428d"}, + {file = "yarl-1.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:159d81f22d7a43e6eabc36d7194cb53f2f15f498dbbfa8edc8a3239350f59fe7"}, + {file = "yarl-1.9.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:832b7e711027c114d79dffb92576acd1bd2decc467dec60e1cac96912602d0e6"}, + {file = "yarl-1.9.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:95d2ecefbcf4e744ea952d073c6922e72ee650ffc79028eb1e320e732898d7e8"}, + {file = "yarl-1.9.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:d4e2c6d555e77b37288eaf45b8f60f0737c9efa3452c6c44626a5455aeb250b9"}, + {file = "yarl-1.9.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:783185c75c12a017cc345015ea359cc801c3b29a2966c2655cd12b233bf5a2be"}, + {file = "yarl-1.9.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:b8cc1863402472f16c600e3e93d542b7e7542a540f95c30afd472e8e549fc3f7"}, + {file = "yarl-1.9.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:822b30a0f22e588b32d3120f6d41e4ed021806418b4c9f0bc3048b8c8cb3f92a"}, + {file = "yarl-1.9.2-cp311-cp311-win32.whl", hash = "sha256:a60347f234c2212a9f0361955007fcf4033a75bf600a33c88a0a8e91af77c0e8"}, + {file = "yarl-1.9.2-cp311-cp311-win_amd64.whl", hash = "sha256:be6b3fdec5c62f2a67cb3f8c6dbf56bbf3f61c0f046f84645cd1ca73532ea051"}, + {file = "yarl-1.9.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:38a3928ae37558bc1b559f67410df446d1fbfa87318b124bf5032c31e3447b74"}, + {file = "yarl-1.9.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ac9bb4c5ce3975aeac288cfcb5061ce60e0d14d92209e780c93954076c7c4367"}, + {file = "yarl-1.9.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3da8a678ca8b96c8606bbb8bfacd99a12ad5dd288bc6f7979baddd62f71c63ef"}, + {file = "yarl-1.9.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:13414591ff516e04fcdee8dc051c13fd3db13b673c7a4cb1350e6b2ad9639ad3"}, + {file = "yarl-1.9.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf74d08542c3a9ea97bb8f343d4fcbd4d8f91bba5ec9d5d7f792dbe727f88938"}, + {file = "yarl-1.9.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e7221580dc1db478464cfeef9b03b95c5852cc22894e418562997df0d074ccc"}, + {file = "yarl-1.9.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:494053246b119b041960ddcd20fd76224149cfea8ed8777b687358727911dd33"}, + {file = "yarl-1.9.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:52a25809fcbecfc63ac9ba0c0fb586f90837f5425edfd1ec9f3372b119585e45"}, + {file = "yarl-1.9.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:e65610c5792870d45d7b68c677681376fcf9cc1c289f23e8e8b39c1485384185"}, + {file = "yarl-1.9.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:1b1bba902cba32cdec51fca038fd53f8beee88b77efc373968d1ed021024cc04"}, + {file = "yarl-1.9.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:662e6016409828ee910f5d9602a2729a8a57d74b163c89a837de3fea050c7582"}, + {file = "yarl-1.9.2-cp37-cp37m-win32.whl", hash = 
"sha256:f364d3480bffd3aa566e886587eaca7c8c04d74f6e8933f3f2c996b7f09bee1b"}, + {file = "yarl-1.9.2-cp37-cp37m-win_amd64.whl", hash = "sha256:6a5883464143ab3ae9ba68daae8e7c5c95b969462bbe42e2464d60e7e2698368"}, + {file = "yarl-1.9.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5610f80cf43b6202e2c33ba3ec2ee0a2884f8f423c8f4f62906731d876ef4fac"}, + {file = "yarl-1.9.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b9a4e67ad7b646cd6f0938c7ebfd60e481b7410f574c560e455e938d2da8e0f4"}, + {file = "yarl-1.9.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:83fcc480d7549ccebe9415d96d9263e2d4226798c37ebd18c930fce43dfb9574"}, + {file = "yarl-1.9.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5fcd436ea16fee7d4207c045b1e340020e58a2597301cfbcfdbe5abd2356c2fb"}, + {file = "yarl-1.9.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:84e0b1599334b1e1478db01b756e55937d4614f8654311eb26012091be109d59"}, + {file = "yarl-1.9.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3458a24e4ea3fd8930e934c129b676c27452e4ebda80fbe47b56d8c6c7a63a9e"}, + {file = "yarl-1.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:838162460b3a08987546e881a2bfa573960bb559dfa739e7800ceeec92e64417"}, + {file = "yarl-1.9.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f4e2d08f07a3d7d3e12549052eb5ad3eab1c349c53ac51c209a0e5991bbada78"}, + {file = "yarl-1.9.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:de119f56f3c5f0e2fb4dee508531a32b069a5f2c6e827b272d1e0ff5ac040333"}, + {file = "yarl-1.9.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:149ddea5abf329752ea5051b61bd6c1d979e13fbf122d3a1f9f0c8be6cb6f63c"}, + {file = "yarl-1.9.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:674ca19cbee4a82c9f54e0d1eee28116e63bc6fd1e96c43031d11cbab8b2afd5"}, + {file = "yarl-1.9.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:9b3152f2f5677b997ae6c804b73da05a39daa6a9e85a512e0e6823d81cdad7cc"}, + {file = "yarl-1.9.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:5415d5a4b080dc9612b1b63cba008db84e908b95848369aa1da3686ae27b6d2b"}, + {file = "yarl-1.9.2-cp38-cp38-win32.whl", hash = "sha256:f7a3d8146575e08c29ed1cd287068e6d02f1c7bdff8970db96683b9591b86ee7"}, + {file = "yarl-1.9.2-cp38-cp38-win_amd64.whl", hash = "sha256:63c48f6cef34e6319a74c727376e95626f84ea091f92c0250a98e53e62c77c72"}, + {file = "yarl-1.9.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:75df5ef94c3fdc393c6b19d80e6ef1ecc9ae2f4263c09cacb178d871c02a5ba9"}, + {file = "yarl-1.9.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c027a6e96ef77d401d8d5a5c8d6bc478e8042f1e448272e8d9752cb0aff8b5c8"}, + {file = "yarl-1.9.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f3b078dbe227f79be488ffcfc7a9edb3409d018e0952cf13f15fd6512847f3f7"}, + {file = "yarl-1.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:59723a029760079b7d991a401386390c4be5bfec1e7dd83e25a6a0881859e716"}, + {file = "yarl-1.9.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b03917871bf859a81ccb180c9a2e6c1e04d2f6a51d953e6a5cdd70c93d4e5a2a"}, + {file = "yarl-1.9.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c1012fa63eb6c032f3ce5d2171c267992ae0c00b9e164efe4d73db818465fac3"}, + {file = "yarl-1.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a74dcbfe780e62f4b5a062714576f16c2f3493a0394e555ab141bf0d746bb955"}, + {file = 
"yarl-1.9.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8c56986609b057b4839968ba901944af91b8e92f1725d1a2d77cbac6972b9ed1"}, + {file = "yarl-1.9.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:2c315df3293cd521033533d242d15eab26583360b58f7ee5d9565f15fee1bef4"}, + {file = "yarl-1.9.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:b7232f8dfbd225d57340e441d8caf8652a6acd06b389ea2d3222b8bc89cbfca6"}, + {file = "yarl-1.9.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:53338749febd28935d55b41bf0bcc79d634881195a39f6b2f767870b72514caf"}, + {file = "yarl-1.9.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:066c163aec9d3d073dc9ffe5dd3ad05069bcb03fcaab8d221290ba99f9f69ee3"}, + {file = "yarl-1.9.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8288d7cd28f8119b07dd49b7230d6b4562f9b61ee9a4ab02221060d21136be80"}, + {file = "yarl-1.9.2-cp39-cp39-win32.whl", hash = "sha256:b124e2a6d223b65ba8768d5706d103280914d61f5cae3afbc50fc3dfcc016623"}, + {file = "yarl-1.9.2-cp39-cp39-win_amd64.whl", hash = "sha256:61016e7d582bc46a5378ffdd02cd0314fb8ba52f40f9cf4d9a5e7dbef88dee18"}, + {file = "yarl-1.9.2.tar.gz", hash = "sha256:04ab9d4b9f587c06d801c2abfe9317b77cdf996c65a90d5e84ecc45010823571"}, +] + +[package.dependencies] +idna = ">=2.0" +multidict = ">=4.0" + [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "c5b989915c413ab901c39dd0c4f3b0fe203558c2879952a2460a52bda4f3e857" +content-hash = "a13e69f2bd9e511e1af92ed02b155a90dec38a9b8d983a711e1b67931b467d38" diff --git a/pyproject.toml b/pyproject.toml index 2c099a5b8..f88821cf2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,10 +5,15 @@ description = "Benchmarking the performance of agents far and wide, regardless o authors = ["Silen Naihin "] license = "MIT" readme = "README.md" +packages = [{include = "agbenchmark"}] [tool.poetry.dependencies] python = "^3.9" pytest = "^7.3.2" +click = "^8.1.3" +requests = "^2.31.0" +openai = "^0.27.8" +pydantic = "^1.10.9" [build-system] @@ -19,5 +24,11 @@ build-backend = "poetry.core.masonry.api" minversion = "6.0" addopts = "-ra -q" testpaths = [ - "tests", "benchmark/challenges", + "tests", "agbenchmark", ] +markers = [ + "retrieval", +] + +[tool.poetry.scripts] +agbenchmark = "agbenchmark.start_benchmark:cli" diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/test_api.py b/tests/test_api.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/test_workspace_manager.py b/tests/test_workspace_manager.py deleted file mode 100644 index e69de29bb..000000000 -- cgit v1.2.3 From e5974ca3ea5e3c781f12b66805dea5f1db15d75c Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Wed, 21 Jun 2023 11:44:59 -0400 Subject: Delete file_to_check.txt --- file_to_check.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 file_to_check.txt diff --git a/file_to_check.txt b/file_to_check.txt deleted file mode 100644 index 29afa8611..000000000 --- a/file_to_check.txt +++ /dev/null @@ -1 +0,0 @@ -The capital of America is Washington, D.C. 
\ No newline at end of file
-- cgit v1.2.3

From 15c5469bb1aabf291864b5ba11981948b7b64fb2 Mon Sep 17 00:00:00 2001
From: Silen Naihin
Date: Thu, 22 Jun 2023 08:18:22 -0400
Subject: Add automatic regression markers (#38)

---
 README.md | 2 +-
 agbenchmark/conftest.py | 33 ++++
 agbenchmark/start_benchmark.py | 64 +++++++++++++++--------
 agbenchmark/tests/regression/RegressionManager.py | 22 ++++++++
 agbenchmark/tests/regression/regression_tests.txt | 0
 pyproject.toml | 1 +
 6 files changed, 99 insertions(+), 23 deletions(-)
 create mode 100644 agbenchmark/tests/regression/RegressionManager.py
 create mode 100644 agbenchmark/tests/regression/regression_tests.txt

diff --git a/README.md b/README.md
index 216f1202c..b46562d2d 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ Share your progress :)
 1. `pip install auto-gpt-benchmarks`
 2. Add boilerplate code to start webserver to your agent (run loop and stop condition)
-3. `agbenchmark start --challenge challenge_category` remove challenge flag to run all tests. specify config of hostname, port, and workspace directory
+3. `agbenchmark start --category challenge_category`. Omit the category flag to run all tests. Specify the hostname, port, and workspace directory in the config
 4. We call the server to run the agent for each test
 5. Show pass rate of tests, logs, and any other metrics

diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py
index b3ca086d8..55f5ca82d 100644
--- a/agbenchmark/conftest.py
+++ b/agbenchmark/conftest.py
@@ -3,6 +3,7 @@ import os
 import pytest
 import shutil
 from agbenchmark.mocks.tests.retrieval_manual import mock_retrieval
+from agbenchmark.tests.regression.RegressionManager import RegressionManager
 import requests
 
 
@@ -41,3 +42,35 @@ def server_response(request, config):
     #     response.status_code == 200
     # ), f"Request failed with status code {response.status_code}"
     mock_retrieval(task, config["workspace"])
+
+
+regression_txt = "agbenchmark/tests/regression/regression_tests.txt"
+
+regression_manager = RegressionManager(regression_txt)
+
+
+def pytest_runtest_makereport(item, call):
+    """Called for each test report. Generated for each stage
+    of a test run (setup, call, teardown)."""
+    if call.when == "call":
+        if (
+            call.excinfo is None
+        ):  # if no error in the call stage, add it as a regression test
+            regression_manager.add_test(item.nodeid)
+        else:  # otherwise, :(
+            regression_manager.remove_test(item.nodeid)
+
+
+def pytest_collection_modifyitems(items):
+    """Called once all test items are collected. Used
+    to add regression marker to collected test items."""
+    for item in items:
+        print("pytest_collection_modifyitems", item.nodeid)
+        if item.nodeid + "\n" in regression_manager.tests:
+            print(regression_manager.tests)
+            item.add_marker(pytest.mark.regression)
+
+
+def pytest_sessionfinish():
+    """Called at the end of the session to save regression tests"""
+    regression_manager.save()

diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py
index 79f308435..b7a116ebc 100644
--- a/agbenchmark/start_benchmark.py
+++ b/agbenchmark/start_benchmark.py
@@ -10,38 +10,58 @@ def cli():
 
 
 @cli.command()
-@click.option("--challenge", default=None, help="Specific challenge to run")
-def start(challenge):
-    """Start the benchmark tests. If a challenge flag is provided, run the challenges with that mark."""
-    with open("agbenchmark/config.json", "r") as f:
-        config = json.load(f)
+@click.option("--category", default=None, help="Specific category to run")
+@click.option("--noreg", is_flag=True, help="Skip regression tests")
+def start(category, noreg):
+    """Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
+    config_file = "agbenchmark/config.json"
 
-    print("Current configuration:")
-    for key, value in config.items():
-        print(f"{key}: {value}")
+    # Check if configuration file exists and is not empty
+    if not os.path.exists(config_file) or os.stat(config_file).st_size == 0:
+        config = {}
 
-    update_config = click.confirm(
-        "\nDo you want to update these parameters?", default=False
-    )
-    if update_config:
         config["hostname"] = click.prompt(
-            "\nPlease enter a new hostname", default=config["hostname"]
+            "\nPlease enter a new hostname", default="localhost"
         )
-        config["port"] = click.prompt("Please enter a new port", default=config["port"])
+        config["port"] = click.prompt("Please enter a new port", default=8080)
         config["workspace"] = click.prompt(
-            "Please enter a new workspace path", default=config["workspace"]
+            "Please enter a new workspace path", default="/path/to/workspace"
         )
 
-    with open("agbenchmark/config.json", "w") as f:
-        json.dump(config, f)
+        with open(config_file, "w") as f:
+            json.dump(config, f)
+    else:
+        # If the configuration file exists and is not empty, load it
+        with open(config_file, "r") as f:
+            config = json.load(f)
 
-    print("Starting benchmark tests...", challenge)
-    if challenge:
-        print(f"Running {challenge} challenges")
-        pytest.main(["agbenchmark", "-m", challenge, "-vs"])
+    print("Current configuration:")
+    for key, value in config.items():
+        print(f"{key}: {value}")
+
+    print("Starting benchmark tests...", category)
+    pytest_args = ["agbenchmark", "-vs"]
+    if category:
+        pytest_args.extend(
+            ["-m", category]
+        )  # run categories that have the given marker
+        if noreg:
+            pytest_args.extend(
+                ["-k", "not regression"]
+            )  # run categories with the given marker, but exclude regression tests
+        print(f"Running {'non-regression ' + category if noreg else category} categories")
     else:
-        print("Running all challenges")
-        pytest.main(["agbenchmark", "-vs"])
+        if noreg:
+            print("Running all non-regression categories")
+            pytest_args.extend(
+                ["-k", "not regression"]
+            )  # run everything except regression tests
+        else:
+            print("Running all categories")  # run all categories
+
+    # Run pytest with the constructed arguments
+    pytest.main(pytest_args)
 
 
 if __name__ == "__main__":

diff --git a/agbenchmark/tests/regression/RegressionManager.py b/agbenchmark/tests/regression/RegressionManager.py
new file mode 100644
index 000000000..9117d53f1
--- /dev/null
+++ b/agbenchmark/tests/regression/RegressionManager.py
@@ -0,0 +1,22 @@
+class RegressionManager:
+    """Abstracts interaction with the regression tests file"""
+
+    def __init__(self, filename: str):
+        self.filename = filename
+        self.load()
+
+    def load(self) -> None:
+        with open(self.filename, "r") as f:
+            self.tests = f.readlines()
+
+    def save(self) -> None:
+        with open(self.filename, "w") as f:
+            f.writelines(self.tests)
+
+    def add_test(self, test_id) -> None:
+        if f"{test_id}\n" not in self.tests:
+            self.tests.append(f"{test_id}\n")
+
+    def remove_test(self, test_id) -> None:
+        if f"{test_id}\n" in self.tests:
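+            # load() stores each entry as a "nodeid\n" string, so remove exactly that form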
+            self.tests.remove(f"{test_id}\n")

diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt
new file mode 100644
index 000000000..e69de29bb

diff --git a/pyproject.toml b/pyproject.toml
index f88821cf2..5498381a2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,6 +28,7 @@ testpaths = [
 ]
 markers = [
     "retrieval",
+    "regression"
 ]
 
 [tool.poetry.scripts]
-- cgit v1.2.3

From ffd1d15a0e32d608304f4e356eff2fbc306b3007 Mon Sep 17 00:00:00 2001
From: Silen Naihin
Date: Fri, 23 Jun 2023 07:53:57 -0400
Subject: MockManager, mock_func in data.json (#39)

---
 agbenchmark/challenges/README.md | 3 +-
 agbenchmark/challenges/define_task_types.py | 1 +
 agbenchmark/challenges/retrieval/r1/r1_data.json | 3 +-
 agbenchmark/challenges/retrieval/r1/r1_test.py | 2 +-
 agbenchmark/conftest.py | 40 ++++++++++++++++++------
 agbenchmark/mocks/MockManager.py | 28 +++++++++++++++++
 agbenchmark/mocks/tests/basic_mocks.py | 0
 agbenchmark/mocks/tests/retrieval_manual.py | 10 ------
 agbenchmark/mocks/tests/retrieval_mocks.py | 13 ++++++++
 9 files changed, 77 insertions(+), 23 deletions(-)
 create mode 100644 agbenchmark/mocks/MockManager.py
 create mode 100644 agbenchmark/mocks/tests/basic_mocks.py
 delete mode 100644 agbenchmark/mocks/tests/retrieval_manual.py
 create mode 100644 agbenchmark/mocks/tests/retrieval_mocks.py

diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md
index 40281c99b..50efe2c4d 100644
--- a/agbenchmark/challenges/README.md
+++ b/agbenchmark/challenges/README.md
@@ -17,8 +17,9 @@ Input:
   - **answer** (str): The raw text of ground truth answer
   - **should_contain** (list): the exact strings that is required in the final answer
   - **should_not_contain** (list): the exact strings that should not be in the final answer
-  - **files**: files that the are used for retrieval
+  - **files**: files that are used for retrieval. Can specify a file here or an extension **TODO:** like .txt
 - **difficulty**(str): the difficulty of this query. choices from
+- **mock_func**: function to mock the agent's response. This is used for testing purposes.
 
 Example:

diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py
index 94b5ba533..f1a841b53 100644
--- a/agbenchmark/challenges/define_task_types.py
+++ b/agbenchmark/challenges/define_task_types.py
@@ -16,6 +16,7 @@ class Challenge(BaseModel):
     task: str
     ground: Ground
     difficulty: str
+    mock_func: Optional[str] = None
 
     def serialize(self, path: str) -> None:
         with open(path, "w") as file:

diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json
index b5d5701ea..c7cc31004 100644
--- a/agbenchmark/challenges/retrieval/r1/r1_data.json
+++ b/agbenchmark/challenges/retrieval/r1/r1_data.json
@@ -7,5 +7,6 @@
     "should_not_contain": ["New York", "Los Angeles", "San Francisco"],
     "files": ["file_to_check.txt"]
   },
-  "difficulty": "easy"
+  "difficulty": "easy",
+  "mock_func": "retrieval_1_mock"
 }

diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py
index 195de15f8..e20c9f7b9 100644
--- a/agbenchmark/challenges/retrieval/r1/r1_test.py
+++ b/agbenchmark/challenges/retrieval/r1/r1_test.py
@@ -11,7 +11,7 @@ class TestRetrieval1(RetrievalChallenge):
 
     @pytest.mark.parametrize(
         "server_response",
-        [data.task],
+        [(data.task, data.mock_func)],
         indirect=True,
     )
     @pytest.mark.retrieval

diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py
index 55f5ca82d..908d39e89 100644
--- a/agbenchmark/conftest.py
+++ b/agbenchmark/conftest.py
@@ -2,9 +2,10 @@ import json
 import os
 import pytest
 import shutil
-from agbenchmark.mocks.tests.retrieval_manual import mock_retrieval
 from agbenchmark.tests.regression.RegressionManager import RegressionManager
 import requests
+from requests.exceptions import RequestException
+from agbenchmark.mocks.MockManager import MockManager
 
 
 @pytest.fixture(scope="module")
@@ -33,15 +34,34 @@ def workspace(config):
 
 @pytest.fixture(autouse=True)
 def server_response(request, config):
-    task = request.param  # The task is passed in indirectly
-    print(f"Server starting at {request.module}")
-    # response = requests.post(
-    #     f"{config['hostname']}:{config['port']}", data={"task": task}
-    # )
-    # assert (
-    #     response.status_code == 200
-    # ), f"Request failed with status code {response.status_code}"
-    mock_retrieval(task, config["workspace"])
+    """Get a response for the task, delegating to the named mock if one is given."""
+    if isinstance(request.param, tuple):
+        task = request.param[0]  # The task is passed in indirectly
+        mock_function_name = request.param[1]
+    else:
+        task = request.param
+        mock_function_name = None
+    # print(f"Server starting at {request.module}")
+    # try:
+    #     response = requests.post(
+    #         f"{config['hostname']}:{config['port']}", data={"task": task}
+    #     )
+    #     response.raise_for_status()  # This will raise an HTTPError if the status is 4xx or 5xx
+    # except RequestException:
+    #     # If an exception occurs (could be connection, timeout, or HTTP errors), we use the mock
+
+    if mock_function_name:
+        mock_manager = MockManager(
+            task
+        )  # workspace doesn't need to be passed in, stays the same
+        print("Server unavailable, using mock", mock_function_name)
+        mock_manager.delegate(mock_function_name)
+    else:
+        print("No mock provided")
+
+    # else:
+    #     # This code is run if no exception occurred
+    #     print(f"Request succeeded with status code {response.status_code}")

diff --git a/agbenchmark/mocks/MockManager.py b/agbenchmark/mocks/MockManager.py
new file mode 100644
index 000000000..f4e7f5f5a
--- /dev/null
+++ b/agbenchmark/mocks/MockManager.py
@@ -0,0 +1,28 @@
+import sys
+import agbenchmark.mocks.tests.basic_mocks as basic_mocks
+import agbenchmark.mocks.tests.retrieval_mocks as retrieval_mocks
+
+
+class MockManager:
+    def __init__(self, task: str):
+        self.task = task
+        self.workspace = "agbenchmark/mocks/workspace"
+        self.modules = [basic_mocks, retrieval_mocks]
+
+    def delegate(self, mock_function_name, *args, **kwargs):
+        if hasattr(self, mock_function_name):
+            # Check if the mock function is an attribute of this class
+            getattr(self, mock_function_name)(*args, **kwargs)
+        elif mock_function_name in globals():
+            # Check if the function is imported in the file
+            func = globals()[mock_function_name]
+            func(self.task, self.workspace, *args, **kwargs)
+        else:
+            # check if the function is in one of the imported mock modules
+            for module in self.modules:
+                if hasattr(module, mock_function_name):
+                    func = getattr(module, mock_function_name)
+                    func(self.task, self.workspace, *args, **kwargs)
+                    return
+            # nothing matched, so fail loudly instead of silently doing nothing
+            raise ValueError(f"No such mock: {mock_function_name}")

diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py
new file mode 100644
index 000000000..e69de29bb

diff --git a/agbenchmark/mocks/tests/retrieval_manual.py b/agbenchmark/mocks/tests/retrieval_manual.py
deleted file mode 100644
index ccb482132..000000000
--- a/agbenchmark/mocks/tests/retrieval_manual.py
+++ /dev/null
@@ -1,10 +0,0 @@
-from ..basic_gpt_agent import basic_gpt_agent
-from agbenchmark.Challenge import Challenge
-
-
-def mock_retrieval(task: str, workspace: str):
-    # Call the basic_gpt_agent to get a response.
-    response = basic_gpt_agent(task)
-
-    # Open the file in write mode.
-    Challenge.write_to_file(workspace, "file_to_check.txt", response)

diff --git a/agbenchmark/mocks/tests/retrieval_mocks.py b/agbenchmark/mocks/tests/retrieval_mocks.py
new file mode 100644
index 000000000..23f4bde17
--- /dev/null
+++ b/agbenchmark/mocks/tests/retrieval_mocks.py
@@ -0,0 +1,13 @@
+from ..basic_gpt_agent import basic_gpt_agent
+from agbenchmark.Challenge import Challenge
+
+
+# TODO: Make it so that you can specify for tests to only run if their prerequisites are met.
+# Prerequisites here would be writing to a file (basic_abilities test).
+# Should also check if prerequisites exists in regression file
+def retrieval_1_mock(task: str, workspace: str):
+    # Call the basic_gpt_agent to get a response.
+    response = basic_gpt_agent(task)
+
+    # Open the file in write mode.
+    Challenge.write_to_file(workspace, "file_to_check.txt", response)
-- cgit v1.2.3

From b6562f3420bd6a77d415d8d57d3a1c9a4f9ed354 Mon Sep 17 00:00:00 2001
From: Silen Naihin
Date: Fri, 23 Jun 2023 09:31:21 -0400
Subject: Update README.md

---
 README.md | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index b46562d2d..0a8d119af 100644
--- a/README.md
+++ b/README.md
@@ -4,13 +4,16 @@ A repo built for the purpose of benchmarking the performance of agents far and w
 
 ##### Diagrams: https://whimsical.com/agbenchmark-5n4hXBq1ZGzBwRsK4TVY7x
 
-## Contributing
+### To run the basic existing mock (June 21)
 
-- Make sure you have `poetry` installed - `pip install poetry`.
-- Then `poetry install` for dependencies
+1. clone the repo `auto-gpt-benchmarks`
+2. `pip install poetry`
+3. `poetry shell`
+4. `poetry install`
+5. `agbenchmark start`
+   Keep config the same and watch the logs :)
 
 - To add requirements `poetry add requirement`.
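+- Without the CLI wrapper, `poetry run pytest agbenchmark -m retrieval -vs` should mirror the pytest invocation that `agbenchmark start --category retrieval` builds internally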
-- To run in venv `poetry run python script.py` Feel free to create prs to merge with `main` at will (but also feel free to ask for review) - if you can't send msg in R&D chat for access. -- cgit v1.2.3 From a5073ab57790a84d146877e1b6512eecbfc12b09 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 24 Jun 2023 09:42:36 -0400 Subject: basic challenges, more ChallengeData structure --- agbenchmark/Challenge.py | 22 ++++++++++++++++ agbenchmark/challenges/define_task_types.py | 16 ++++++++---- agbenchmark/challenges/retrieval/Retrieval.py | 22 +--------------- agbenchmark/challenges/retrieval/r1/r1_data.json | 10 +++++--- agbenchmark/challenges/retrieval/r1/r1_test.py | 6 +++-- agbenchmark/mocks/tests/basic_mocks.py | 28 +++++++++++++++++++++ agbenchmark/mocks/tests/retrieval_mocks.py | 7 +----- .../basic_abilities/read_file/r_file_data.json | 15 +++++++++++ .../basic_abilities/read_file/read_file_test.py | 29 ++++++++++++++++++++++ .../tests/basic_abilities/read_file_test.py | 0 .../basic_abilities/write_file/w_file_data.json | 16 ++++++++++++ .../basic_abilities/write_file/write_file_test.py | 27 ++++++++++++++++++++ .../tests/basic_abilities/write_file_test.py | 0 pyproject.toml | 3 ++- 14 files changed, 163 insertions(+), 38 deletions(-) create mode 100644 agbenchmark/tests/basic_abilities/read_file/r_file_data.json create mode 100644 agbenchmark/tests/basic_abilities/read_file/read_file_test.py delete mode 100644 agbenchmark/tests/basic_abilities/read_file_test.py create mode 100644 agbenchmark/tests/basic_abilities/write_file/w_file_data.json create mode 100644 agbenchmark/tests/basic_abilities/write_file/write_file_test.py delete mode 100644 agbenchmark/tests/basic_abilities/write_file_test.py diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py index 20bf55853..9828a0e9e 100644 --- a/agbenchmark/Challenge.py +++ b/agbenchmark/Challenge.py @@ -1,5 +1,6 @@ import os from typing import Optional +from agbenchmark.challenges.define_task_types import Ground class Challenge: @@ -30,3 +31,24 @@ class Challenge: for filename in os.listdir(workspace) if os.path.isfile(os.path.join(workspace, filename)) ] + + def scoring(self, content: str, ground: Ground): + if ground.should_contain: + for should_contain_word in ground.should_contain: + if should_contain_word not in content: + return 0.0 + else: + print( + f"Word that should exist: {should_contain_word} exists in the content" + ) + + if ground.should_not_contain: + for should_not_contain_word in ground.should_not_contain: + if should_not_contain_word in content: + return 0.0 + else: + print( + f"Word that should not exist: {should_not_contain_word} does not exist in the content" + ) + + return 1.0 diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index f1a841b53..879a46af0 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -4,6 +4,12 @@ import json import os +class Info(BaseModel): + difficulty: str + description: str + side_effects: List[str] + + class Ground(BaseModel): answer: str should_contain: Optional[List[str]] @@ -11,20 +17,20 @@ class Ground(BaseModel): files: List[str] -class Challenge(BaseModel): - category: str +class ChallengeData(BaseModel): + category: List[str] task: str ground: Ground - difficulty: str mock_func: Optional[str] = None + info: Info def serialize(self, path: str) -> None: with open(path, "w") as file: file.write(self.json()) @staticmethod - def deserialize(path: str) -> "Challenge": + 
def deserialize(path: str) -> "Challenge":
+    def deserialize(path: str) -> "ChallengeData":
         print("Deserializing", path)
         with open(path, "r") as file:
             data = json.load(file)
-        return Challenge(**data)
+        return ChallengeData(**data)

diff --git a/agbenchmark/challenges/retrieval/Retrieval.py b/agbenchmark/challenges/retrieval/Retrieval.py
index 2db22ae4d..9434d69c3 100644
--- a/agbenchmark/challenges/retrieval/Retrieval.py
+++ b/agbenchmark/challenges/retrieval/Retrieval.py
@@ -1,27 +1,7 @@
 from agbenchmark.Challenge import Challenge
-from agbenchmark.challenges.define_task_types import Ground
 
 
 class RetrievalChallenge(Challenge):
     """Challenge for information-retrieval"""
 
-    def scoring(self, content: str, ground: Ground):
-        if ground.should_contain:
-            for should_contain_word in ground.should_contain:
-                if should_contain_word not in content:
-                    return 0.0
-                else:
-                    print(
-                        f"Word that should exist: {should_contain_word} exists in the content"
-                    )
-
-        if ground.should_not_contain:
-            for should_not_contain_word in ground.should_not_contain:
-                if should_not_contain_word in content:
-                    return 0.0
-                else:
-                    print(
-                        f"Word that should not exist: {should_not_contain_word} does not exist in the content"
-                    )
-
-        return 1.0
+    pass

diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json
index c7cc31004..08b74d1b7 100644
--- a/agbenchmark/challenges/retrieval/r1/r1_data.json
+++ b/agbenchmark/challenges/retrieval/r1/r1_data.json
@@ -1,5 +1,5 @@
 {
-  "category": "retrieval",
+  "category": ["basic"],
   "task": "What is the capital of America?",
   "ground": {
     "answer": "Washington",
@@ -7,6 +7,10 @@
     "should_not_contain": ["New York", "Los Angeles", "San Francisco"],
     "files": ["file_to_check.txt"]
   },
-  "difficulty": "easy",
-  "mock_func": "retrieval_1_mock"
+  "mock_func": "write_file_mock",
+  "info": {
+    "difficulty": "easy",
+    "description": "Tests the writing to file",
+    "side_effects": ["tests if there is in fact an LLM attached"]
+  }
 }

diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py
index e20c9f7b9..d37c5e795 100644
--- a/agbenchmark/challenges/retrieval/r1/r1_test.py
+++ b/agbenchmark/challenges/retrieval/r1/r1_test.py
@@ -1,9 +1,11 @@
 import pytest
 from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge
-from agbenchmark.challenges.define_task_types import Challenge, Ground
+from agbenchmark.challenges.define_task_types import ChallengeData, Ground
 import os
 
-data = Challenge.deserialize(os.path.join(os.path.dirname(__file__), "r1_data.json"))
+data = ChallengeData.deserialize(
+    os.path.join(os.path.dirname(__file__), "r1_data.json")
+)
 
 
 class TestRetrieval1(RetrievalChallenge):

diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py
index e69de29bb..eb7b96541 100644
--- a/agbenchmark/mocks/tests/basic_mocks.py
+++ b/agbenchmark/mocks/tests/basic_mocks.py
@@ -0,0 +1,28 @@
+from agbenchmark.Challenge import Challenge
+from ..basic_gpt_agent import basic_gpt_agent
+
+
+def basic_read_file_mock(task: str, workspace: str):
+    """
+    This mock reads a file and returns its content.
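+    (It also seeds file_to_check.txt first, then rewrites it with a
+    "random string: " prefix; see the calls below.)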
+    """
+
+    Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing")
+
+    file_contents = Challenge.open_file(workspace, "file_to_check.txt")
+
+    Challenge.write_to_file(
+        workspace, "file_to_check.txt", f"random string: {file_contents}"
+    )
+
+
+def basic_write_file_mock(task: str, workspace: str):
+    """
+    This mock writes to a file (creates one if it doesn't exist)
+    """
+
+    # Call the basic_gpt_agent to get a response.
+    response = basic_gpt_agent(task)
+
+    # Open the file in write mode.
+    Challenge.write_to_file(workspace, "file_to_check.txt", response)

diff --git a/agbenchmark/mocks/tests/retrieval_mocks.py b/agbenchmark/mocks/tests/retrieval_mocks.py
index 23f4bde17..2481de060 100644
--- a/agbenchmark/mocks/tests/retrieval_mocks.py
+++ b/agbenchmark/mocks/tests/retrieval_mocks.py
@@ -1,4 +1,3 @@
-from ..basic_gpt_agent import basic_gpt_agent
 from agbenchmark.Challenge import Challenge
 
 
@@ -6,8 +5,4 @@ from agbenchmark.Challenge import Challenge
 # Prerequisites here would be writing to a file (basic_abilities test).
 # Should also check if prerequisites exists in regression file
 def retrieval_1_mock(task: str, workspace: str):
-    # Call the basic_gpt_agent to get a response.
-    response = basic_gpt_agent(task)
-
-    # Open the file in write mode.
-    Challenge.write_to_file(workspace, "file_to_check.txt", response)
+    pass

diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json
new file mode 100644
index 000000000..55319ddfc
--- /dev/null
+++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json
@@ -0,0 +1,15 @@
+{
+  "category": ["basic"],
+  "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
+  "ground": {
+    "answer": "random string: this is how we're doing",
+    "should_contain": ["random string: this is how we're doing"],
+    "files": ["file_to_check.txt"]
+  },
+  "mock_func": "basic_read_file_mock",
+  "info": {
+    "description": "This reads the file quickly",
+    "difficulty": "basic",
+    "side_effects": [""]
+  }
+}

diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py
new file mode 100644
index 000000000..610ccdab6
--- /dev/null
+++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py
@@ -0,0 +1,29 @@
+import pytest
+from agbenchmark.challenges.define_task_types import ChallengeData
+from agbenchmark.Challenge import Challenge
+import os
+
+data = ChallengeData.deserialize(
+    os.path.join(os.path.dirname(__file__), "r_file_data.json")
+)
+
+
+class TestReadFile(Challenge):
+    """Testing if LLM can read a file"""
+
+    @pytest.mark.parametrize(
+        "server_response",
+        [(data.task, data.mock_func)],
+        indirect=True,
+    )
+    @pytest.mark.basic
+    def test_retrieval(
+        self, workspace
+    ):  # the fixture argument just pulls in the shared workspace
+        file = self.open_file(workspace, data.ground.files[0])
+
+        score = self.scoring(file, data.ground)
+
+        print("Your score is:", score)
+
+        assert score

diff --git a/agbenchmark/tests/basic_abilities/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file_test.py
deleted file mode 100644
index e69de29bb..000000000

diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json
new file mode 100644
index 000000000..4aaa1347d
--- /dev/null
+++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json
@@ -0,0 +1,16 @@
+{
+  "category": ["basic"],
+  "task": "What is the capital of America?",
+  "ground": {
+    "answer": "Washington",
+    "should_contain": ["Washington"],
+    "should_not_contain": ["New York", "Los Angeles", "San Francisco"],
+    "files": ["file_to_check.txt"]
+  },
+  "mock_func": "basic_write_file_mock",
+  "info": {
+    "difficulty": "easy",
+    "description": "Tests the writing to file",
+    "side_effects": ["tests if there is in fact an LLM attached"]
+  }
+}

diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py
new file mode 100644
index 000000000..ccb10fe70
--- /dev/null
+++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py
@@ -0,0 +1,27 @@
+import pytest
+from agbenchmark.challenges.define_task_types import ChallengeData
+from agbenchmark.Challenge import Challenge
+import os
+
+data = ChallengeData.deserialize(
+    os.path.join(os.path.dirname(__file__), "w_file_data.json")
+)
+
+
+class TestWriteFile(Challenge):
+    """Testing if LLM can write to a file"""
+
+    @pytest.mark.parametrize(
+        "server_response",
+        [(data.task, data.mock_func)],
+        indirect=True,
+    )
+    @pytest.mark.basic
+    def test_retrieval(self, workspace):
+        file = self.open_file(workspace, data.ground.files[0])
+
+        score = self.scoring(file, data.ground)
+
+        print("Your score is:", score)
+
+        assert score

diff --git a/agbenchmark/tests/basic_abilities/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file_test.py
deleted file mode 100644
index e69de29bb..000000000

diff --git a/pyproject.toml b/pyproject.toml
index 5498381a2..6f79e75ce 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,7 +28,8 @@ testpaths = [
 ]
 markers = [
     "retrieval",
-    "regression"
+    "regression",
+    "basic"
 ]
 
 [tool.poetry.scripts]
-- cgit v1.2.3

From 66c9e68b0430066d23e9acd66e5259ea5d5190d7 Mon Sep 17 00:00:00 2001
From: Silen Naihin
Date: Sat, 24 Jun 2023 12:15:53 -0400
Subject: file creation from within file before server :)

---
 agbenchmark/conftest.py | 2 +-
 agbenchmark/mocks/tests/basic_mocks.py | 2 +-
 agbenchmark/tests/basic_abilities/read_file/read_file_test.py | 8 ++++++++
 agbenchmark/tests/regression/regression_tests.txt | 2 ++
 4 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py
index 908d39e89..434f6dbde 100644
--- a/agbenchmark/conftest.py
+++ b/agbenchmark/conftest.py
@@ -17,7 +17,7 @@ def config():
     return config
 
 
-@pytest.fixture
+@pytest.fixture(scope="module")
 def workspace(config):
     yield config["workspace"]
     # teardown after test function completes

diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py
index eb7b96541..bbff6a9c7 100644
--- a/agbenchmark/mocks/tests/basic_mocks.py
+++ b/agbenchmark/mocks/tests/basic_mocks.py
@@ -7,7 +7,7 @@ def basic_read_file_mock(task: str, workspace: str):
     This mock reads a file and returns its content.
""" - Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing") + # Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing") file_contents = Challenge.open_file(workspace, "file_to_check.txt") diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 610ccdab6..35d1d80c5 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -8,6 +8,14 @@ data = ChallengeData.deserialize( ) +@pytest.fixture(scope="module", autouse=True) +def setup_module(workspace): + if data.ground.should_contain: + Challenge.write_to_file( + workspace, data.ground.files[0], "this is how we're doing" + ) + + class TestReadFile(Challenge): """Testing if LLM can read a file""" diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index e69de29bb..a5f8fbd1d 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -0,0 +1,2 @@ +agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0] -- cgit v1.2.3 From 4fa9f72083aa09bf1770f10a3254c4d0ef674a9a Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 24 Jun 2023 12:24:17 -0400 Subject: adding dependencies on other challenges --- agbenchmark/mocks/tests/basic_mocks.py | 2 -- .../tests/basic_abilities/read_file/read_file_test.py | 1 + .../tests/basic_abilities/write_file/write_file_test.py | 1 + agbenchmark/tests/regression/regression_tests.txt | 1 - poetry.lock | 15 ++++++++++++++- pyproject.toml | 1 + 6 files changed, 17 insertions(+), 4 deletions(-) diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index bbff6a9c7..550095b72 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -7,8 +7,6 @@ def basic_read_file_mock(task: str, workspace: str): This mock reads a file and returns its content. 
""" - # Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing") - file_contents = Challenge.open_file(workspace, "file_to_check.txt") Challenge.write_to_file( diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 35d1d80c5..ea794281e 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -25,6 +25,7 @@ class TestReadFile(Challenge): indirect=True, ) @pytest.mark.basic + @pytest.mark.dependency(depends=["write_file"]) def test_retrieval( self, workspace ): # create_file simply there for the function to depend on the fixture diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index ccb10fe70..b2c559c9e 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -17,6 +17,7 @@ class TestWriteFile(Challenge): indirect=True, ) @pytest.mark.basic + @pytest.mark.dependency(name="write_file") def test_retrieval(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index a5f8fbd1d..84e625af4 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,2 +1 @@ -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0] agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0] diff --git a/poetry.lock b/poetry.lock index 3f1059aaf..3bc37622e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -595,6 +595,19 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +[[package]] +name = "pytest-dependency" +version = "0.5.1" +description = "Manage dependencies of tests" +optional = false +python-versions = "*" +files = [ + {file = "pytest-dependency-0.5.1.tar.gz", hash = "sha256:c2a892906192663f85030a6ab91304e508e546cddfe557d692d61ec57a1d946b"}, +] + +[package.dependencies] +pytest = ">=3.6.0" + [[package]] name = "requests" version = "2.31.0" @@ -765,4 +778,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "a13e69f2bd9e511e1af92ed02b155a90dec38a9b8d983a711e1b67931b467d38" +content-hash = "4a1629eb643b5b68d47f6d1407942aa6d4a796c6d5a1b6a54bbc096b9d0efa2d" diff --git a/pyproject.toml b/pyproject.toml index 6f79e75ce..087ac8447 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ click = "^8.1.3" requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" +pytest-dependency = "^0.5.1" [build-system] -- cgit v1.2.3 From f895d54e02c92e262172d9a773f7e6a4870d435d Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 24 Jun 2023 14:42:35 -0400 Subject: more elegant marking & dependency solution --- README.md | 74 +++++++++++++++++++--- agbenchmark/challenges/README.md | 38 +++++------ agbenchmark/challenges/define_task_types.py | 1 + agbenchmark/challenges/retrieval/r1/r1_data.json | 1 + .../tests/basic_abilities/BasicChallenge.py | 7 ++ .../basic_abilities/read_file/r_file_data.json | 1 + 
 .../basic_abilities/read_file/read_file_test.py | 12 ++--
 .../basic_abilities/write_file/w_file_data.json | 1 +
 .../basic_abilities/write_file/write_file_test.py | 9 ++-
 agbenchmark/tests/regression/regression_tests.txt | 2 +
 poetry.lock | 17 ++++-
 pyproject.toml | 1 +
 12 files changed, 126 insertions(+), 38 deletions(-)
 create mode 100644 agbenchmark/tests/basic_abilities/BasicChallenge.py

diff --git a/README.md b/README.md
index 0a8d119af..0ad0cf345 100644
--- a/README.md
+++ b/README.md
@@ -51,15 +51,73 @@ Share your progress :)
 
 to create a test:
 
-```
-@pytest.mark.parametrize(
-"server_response",
-["VARIABLE"], # VARIABLE = the query/goal you provide to the model
-indirect=True,
+```python
+import pytest
+from agbenchmark.challenges.define_task_types import ChallengeData
+from ..CategoryChallenge import CategoryChallenge
+import os
+
+data = ChallengeData.deserialize(
+    os.path.join(os.path.dirname(__file__), "r_file_data.json")
 )
-@pytest.mark.(VARIABLE) # VARIABLE = category of the test
-def test_file_in_workspace(workspace): # VARIABLE = the actual test that asserts
-assert os.path.exists(os.path.join(workspace, "file_to_check.txt"))
+
+class TestSomething(CategoryChallenge):
+    """Template for a challenge test"""
+
+    @pytest.mark.parametrize(
+        "server_response",
+        [(data.task, data.mock_func)],
+        indirect=True,
+    )
+    def test_retrieval(
+        self, workspace
+    ):
+        # scoring logic goes here
+```
+
+All challenges inherit from a parent class which carries the category mark
+
+```python
+@pytest.mark.basic
+class BasicChallenge(Challenge):
+    pass
+```
+
+If you want to add a custom mark to a Challenge, you must specify it before the test definition
+
+```python
+@pytest.mark.other_mark
+def test_retrieval(self, workspace):
+```
+
+To add a dependency to a challenge use the following
+
+```python
+# to define what a test depends on
+from pytest_dependency import depends
+
+def test1(self, request, workspace):
+    depends(request, data.dependencies)
+# for defining a test as a dependency
+@pytest.mark.dependency()
+def test2(self, workspace):
+```
+
+Ordering of challenges needs to be used in combination with the above so that a test executes after the tests it depends on
+
+```python
+@pytest.mark.run(order=1)
+```
+
+To create a file to test a challenge, add this to the challenge file which will create a file before running the server
+
+```python
+@pytest.fixture(scope="module", autouse=True)
+def setup_module(workspace):
+    if data.ground.should_contain:
+        Challenge.write_to_file(
+            workspace, data.ground.files[0], "this is how we're doing"
+        )
+```
 
 ## Api

diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md
index 50efe2c4d..d5229e937 100644
--- a/agbenchmark/challenges/README.md
+++ b/agbenchmark/challenges/README.md
@@ -4,28 +4,25 @@ Input:
 
-- **category** (str): information-retrieval
-- **difficulty**(str): the difficulty of this query. choices from
-
-## Information-retrieval challenges
-
-Input:
-
-- **category** (str): information-retrieval
-- **task** (str): the question the agent needs to be solve.
+- **category** (str[]): Category of the challenge such as 'retrieval', 'comprehension', etc. _this is not currently used. for the future it may be needed_
+- **task** (str): The task that the agent needs to solve.
+- **dependencies** (str[]): The dependencies that the challenge needs to run.
 - **ground** (dict): The ground truth.
- - **answer** (str): The raw text of ground truth answer - - **should_contain** (list): the exact strings that is required in the final answer - - **should_not_contain** (list): the exact strings that should not be in the final answer - - **files**: files that the are used for retrieval. Can specify file here or an extension **TODO:** like .txt -- **difficulty**(str): the difficulty of this query. choices from -- **mock_func**: function to mock the agent's response. This is used for testing purposes + - **answer** (str): The raw text of the ground truth answer. + - **should_contain** (list): The exact strings that are required in the final answer. + - **should_not_contain** (list): The exact strings that should not be in the final answer. + - **files** (list): Files that are used for retrieval. Can specify file here or an extension. +- **mock_func** (str): Function to mock the agent's response. This is used for testing purposes. +- **info** (dict): Additional info about the challenge. + - **difficulty** (str): The difficulty of this query. + - **description** (str): Description of the challenge. + - **side_effects** (str[]): Describes the effects of the challenge. Example: ```python { - "category": "retrieval", + "category": ["basic"], "task": "What is the capital of America?", "ground": { "answer": "Washington", @@ -33,11 +30,16 @@ Example: "should_not_contain": ["New York", "Los Angeles", "San Francisco"], "files": ["file_to_check.txt"] }, - "difficulty": "easy" + "mock_func": "write_file_mock", + "info": { + "difficulty": "easy", + "description": "Tests the writing to file", + "side_effects": ["tests if there is in fact an LLM attached"] + } } ``` -Output: +Current Output: - **score** (float): scores range from [0, 1] diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index 879a46af0..694671218 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -20,6 +20,7 @@ class Ground(BaseModel): class ChallengeData(BaseModel): category: List[str] task: str + dependencies: List[str] ground: Ground mock_func: Optional[str] = None info: Info diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index 08b74d1b7..fe05b6d51 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -1,5 +1,6 @@ { "category": ["basic"], + "dependencies": ["test_write_file"], "task": "What is the capital of America?", "ground": { "answer": "Washington", diff --git a/agbenchmark/tests/basic_abilities/BasicChallenge.py b/agbenchmark/tests/basic_abilities/BasicChallenge.py new file mode 100644 index 000000000..563207405 --- /dev/null +++ b/agbenchmark/tests/basic_abilities/BasicChallenge.py @@ -0,0 +1,7 @@ +import pytest +from agbenchmark.Challenge import Challenge + + +@pytest.mark.basic +class BasicChallenge(Challenge): + pass diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index 55319ddfc..8c5ef62db 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -1,6 +1,7 @@ { "category": ["basic"], "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "dependencies": ["test_write_file"], "ground": { "answer": "random string: this is how we're doing", "should_contain": 
["random string: this is how we're doing"], diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index ea794281e..03b2d6cab 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -1,7 +1,9 @@ import pytest from agbenchmark.challenges.define_task_types import ChallengeData from agbenchmark.Challenge import Challenge +from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os +from pytest_dependency import depends data = ChallengeData.deserialize( os.path.join(os.path.dirname(__file__), "r_file_data.json") @@ -16,7 +18,7 @@ def setup_module(workspace): ) -class TestReadFile(Challenge): +class TestReadFile(BasicChallenge): """Testing if LLM can read a file""" @pytest.mark.parametrize( @@ -24,11 +26,9 @@ class TestReadFile(Challenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.basic - @pytest.mark.dependency(depends=["write_file"]) - def test_retrieval( - self, workspace - ): # create_file simply there for the function to depend on the fixture + def test_read_file(self, request, workspace): + depends(request, data.dependencies) + file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json index 4aaa1347d..562d1c364 100644 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -1,6 +1,7 @@ { "category": ["basic"], "task": "What is the capital of America?", + "dependencies": [], "ground": { "answer": "Washington", "should_contain": ["Washington"], diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index b2c559c9e..b09162e3d 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -1,6 +1,6 @@ import pytest from agbenchmark.challenges.define_task_types import ChallengeData -from agbenchmark.Challenge import Challenge +from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os data = ChallengeData.deserialize( @@ -8,7 +8,7 @@ data = ChallengeData.deserialize( ) -class TestWriteFile(Challenge): +class TestWriteFile(BasicChallenge): """Testing if LLM can write to a file""" @pytest.mark.parametrize( @@ -16,9 +16,8 @@ class TestWriteFile(Challenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.basic - @pytest.mark.dependency(name="write_file") - def test_retrieval(self, workspace): + @pytest.mark.dependency() + def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index 84e625af4..b831003fc 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1 +1,3 @@ agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0] 
+agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] diff --git a/poetry.lock b/poetry.lock index 3bc37622e..f6f24c5f2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -608,6 +608,21 @@ files = [ [package.dependencies] pytest = ">=3.6.0" +[[package]] +name = "pytest-ordering" +version = "0.6" +description = "pytest plugin to run your tests in a specific order" +optional = false +python-versions = "*" +files = [ + {file = "pytest-ordering-0.6.tar.gz", hash = "sha256:561ad653626bb171da78e682f6d39ac33bb13b3e272d406cd555adb6b006bda6"}, + {file = "pytest_ordering-0.6-py2-none-any.whl", hash = "sha256:27fba3fc265f5d0f8597e7557885662c1bdc1969497cd58aff6ed21c3b617de2"}, + {file = "pytest_ordering-0.6-py3-none-any.whl", hash = "sha256:3f314a178dbeb6777509548727dc69edf22d6d9a2867bf2d310ab85c403380b6"}, +] + +[package.dependencies] +pytest = "*" + [[package]] name = "requests" version = "2.31.0" @@ -778,4 +793,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "4a1629eb643b5b68d47f6d1407942aa6d4a796c6d5a1b6a54bbc096b9d0efa2d" +content-hash = "65b68e43440faafbd2883edd6b10bc177ab334380e908c27c9f511703065f8e7" diff --git a/pyproject.toml b/pyproject.toml index 087ac8447..faee61c2d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" pytest-dependency = "^0.5.1" +pytest-ordering = "^0.6" [build-system] -- cgit v1.2.3 From d1c5e0a91a7a0f23b0e8de5f394204e96ec668cd Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 25 Jun 2023 00:22:53 -0400 Subject: finally figured out right way to do dependencies --- agbenchmark/challenges/retrieval/Retrieval.py | 2 ++ agbenchmark/challenges/retrieval/r1/r1_data.json | 4 ++-- agbenchmark/challenges/retrieval/r1/r1_test.py | 6 ++++-- agbenchmark/tests/basic_abilities/BasicChallenge.py | 1 + .../tests/basic_abilities/read_file/r_file_data.json | 4 +++- .../tests/basic_abilities/read_file/read_file_test.py | 6 ++---- .../basic_abilities/write_file/write_file_test.py | 1 - agbenchmark/tests/regression/regression_tests.txt | 4 ++-- poetry.lock | 19 ++++++++++++++++++- pyproject.toml | 3 ++- 10 files changed, 36 insertions(+), 14 deletions(-) diff --git a/agbenchmark/challenges/retrieval/Retrieval.py b/agbenchmark/challenges/retrieval/Retrieval.py index 9434d69c3..b8aa81ce3 100644 --- a/agbenchmark/challenges/retrieval/Retrieval.py +++ b/agbenchmark/challenges/retrieval/Retrieval.py @@ -1,6 +1,8 @@ from agbenchmark.Challenge import Challenge +import pytest +@pytest.mark.retrieval class RetrievalChallenge(Challenge): """Challenge for information-retrieval""" diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index fe05b6d51..562d1c364 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -1,14 +1,14 @@ { "category": ["basic"], - "dependencies": ["test_write_file"], "task": "What is the capital of America?", + "dependencies": [], "ground": { "answer": "Washington", "should_contain": ["Washington"], "should_not_contain": ["New York", "Los Angeles", "San Francisco"], "files": ["file_to_check.txt"] }, - "mock_func": "write_file_mock", + "mock_func": "basic_write_file_mock", "info": { "difficulty": "easy", "description": "Tests the writing to file", diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 
d37c5e795..5e6d6abf4 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -2,6 +2,8 @@ import pytest from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge from agbenchmark.challenges.define_task_types import ChallengeData, Ground import os +from pytest_dependency import depends + data = ChallengeData.deserialize( os.path.join(os.path.dirname(__file__), "r1_data.json") @@ -16,8 +18,8 @@ class TestRetrieval1(RetrievalChallenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.retrieval - def test_retrieval(self, workspace): + def test_retrieval(self, request, workspace): + depends(request, data.dependencies) file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/basic_abilities/BasicChallenge.py b/agbenchmark/tests/basic_abilities/BasicChallenge.py index 563207405..0cada86cc 100644 --- a/agbenchmark/tests/basic_abilities/BasicChallenge.py +++ b/agbenchmark/tests/basic_abilities/BasicChallenge.py @@ -2,6 +2,7 @@ import pytest from agbenchmark.Challenge import Challenge +@pytest.mark.run(order=1) @pytest.mark.basic class BasicChallenge(Challenge): pass diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index 8c5ef62db..4d04f33e7 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -1,7 +1,9 @@ { "category": ["basic"], "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", - "dependencies": ["test_write_file"], + "dependencies": [ + "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file" + ], "ground": { "answer": "random string: this is how we're doing", "should_contain": ["random string: this is how we're doing"], diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 03b2d6cab..ad08da4e0 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -3,7 +3,6 @@ from agbenchmark.challenges.define_task_types import ChallengeData from agbenchmark.Challenge import Challenge from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os -from pytest_dependency import depends data = ChallengeData.deserialize( os.path.join(os.path.dirname(__file__), "r_file_data.json") @@ -26,9 +25,8 @@ class TestReadFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) - def test_read_file(self, request, workspace): - depends(request, data.dependencies) - + @pytest.mark.order(after=data.dependencies) + def test_read_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index b09162e3d..4c94320e0 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -16,7 +16,6 @@ class TestWriteFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.dependency() def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git 
a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index b831003fc..df27f3124 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,3 +1,3 @@ -agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0] -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0] agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] +agbenchmark/challenges/retrieval/r1/r1_test.py::TestRetrieval1::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[server_response0] diff --git a/poetry.lock b/poetry.lock index f6f24c5f2..4764bf493 100644 --- a/poetry.lock +++ b/poetry.lock @@ -608,6 +608,23 @@ files = [ [package.dependencies] pytest = ">=3.6.0" +[[package]] +name = "pytest-order" +version = "1.1.0" +description = "pytest plugin to run your tests in a specific order" +optional = false +python-versions = ">=3.6" +files = [ + {file = "pytest-order-1.1.0.tar.gz", hash = "sha256:139d25b30826b78eebb42722f747eab14c44b88059d7a71d4f79d14a057269a5"}, + {file = "pytest_order-1.1.0-py3-none-any.whl", hash = "sha256:3b3730969c97900fa5cd31ecff80847680ed56b2490954565c14949ba60d9371"}, +] + +[package.dependencies] +pytest = [ + {version = ">=5.0", markers = "python_version < \"3.10\""}, + {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, +] + [[package]] name = "pytest-ordering" version = "0.6" @@ -793,4 +810,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "65b68e43440faafbd2883edd6b10bc177ab334380e908c27c9f511703065f8e7" +content-hash = "64d22c864fe244497b7ebc81ead1be0b0570b14ee1ced323813d427672e17ff3" diff --git a/pyproject.toml b/pyproject.toml index faee61c2d..fd2c52041 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ openai = "^0.27.8" pydantic = "^1.10.9" pytest-dependency = "^0.5.1" pytest-ordering = "^0.6" +pytest-order = "^1.1.0" [build-system] @@ -24,7 +25,7 @@ build-backend = "poetry.core.masonry.api" [tool.pytest.ini_options] minversion = "6.0" -addopts = "-ra -q" +addopts = "--order-dependencies" # -ra -q testpaths = [ "tests", "agbenchmark", ] -- cgit v1.2.3 From 31c11927199714516891db5aa3044eb1a4396eb4 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 25 Jun 2023 08:48:16 -0400 Subject: other was non solution, solution is pytest-depends --- agbenchmark/challenges/README.md | 20 +++--- agbenchmark/challenges/retrieval/r1/r1_test.py | 2 - .../tests/basic_abilities/BasicChallenge.py | 1 - .../basic_abilities/read_file/r_file_data.json | 4 +- .../basic_abilities/read_file/read_file_test.py | 2 +- .../basic_abilities/write_file/write_file_test.py | 1 + agbenchmark/tests/regression/regression_tests.txt | 2 +- poetry.lock | 80 ++++++++++++---------- pyproject.toml | 6 +- 9 files changed, 59 insertions(+), 59 deletions(-) diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md index d5229e937..e457b85c4 100644 --- a/agbenchmark/challenges/README.md +++ b/agbenchmark/challenges/README.md @@ -6,7 +6,7 @@ Input: - **category** (str[]): Category of the challenge such as 'retrieval', 'comprehension', etc. _this is not currently used. for the future it may be needed_ - **task** (str): The task that the agent needs to solve. 
-- **dependencies** (str[]): The dependencies that the challenge needs to run.
+- **dependencies** (str[]): The dependencies that the challenge needs to run. This needs to be the full pytest node ID of the test function.
 - **ground** (dict): The ground truth.
-  - **answer** (str): The raw text of the ground truth answer.
-  - **should_contain** (list): The exact strings that are required in the final answer.
-  - **should_not_contain** (list): The exact strings that should not be in the final answer.
-  - **files** (list): Files that are used for retrieval. Can specify file here or an extension.
+  - **answer** (str): The raw text of the ground truth answer.
+  - **should_contain** (list): The exact strings that are required in the final answer.
+  - **should_not_contain** (list): The exact strings that should not be in the final answer.
+  - **files** (list): Files that are used for retrieval. Can specify file here or an extension.
 - **mock_func** (str): Function to mock the agent's response. This is used for testing purposes.
 - **info** (dict): Additional info about the challenge.
   - **difficulty** (str): The difficulty of this query.
   - **description** (str): Description of the challenge.
   - **side_effects** (str[]): Describes the effects of the challenge.
 
 Example:
 
 ```python
 {
     "category": ["basic"],
-    "task": "What is the capital of America?",
+    "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
+    "dependencies": [
+        "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file"
+    ],
     "ground": {
-        "answer": "Washington",
-        "should_contain": ["Washington"],
-        "should_not_contain": ["New York", "Los Angeles", "San Francisco"],
+        "answer": "random string: this is how we're doing",
+        "should_contain": ["random string: this is how we're doing"],
         "files": ["file_to_check.txt"]
     },
-    "mock_func": "write_file_mock",
+    "mock_func": "basic_read_file_mock",
     "info": {
-        "difficulty": "easy",
-        "description": "Tests the writing to file",
-        "side_effects": ["tests if there is in fact an LLM attached"]
+        "description": "This reads the file quickly",
+        "difficulty": "basic",
+        "side_effects": [""]
     }
 }

diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py
index 5e6d6abf4..45becaf75 100644
--- a/agbenchmark/challenges/retrieval/r1/r1_test.py
+++ b/agbenchmark/challenges/retrieval/r1/r1_test.py
@@ -2,7 +2,6 @@ import pytest
 from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge
 from agbenchmark.challenges.define_task_types import ChallengeData, Ground
 import os
-from pytest_dependency import depends
 
 
 data = ChallengeData.deserialize(
@@ -19,7 +18,6 @@ class TestRetrieval1(RetrievalChallenge):
         indirect=True,
     )
     def test_retrieval(self, request, workspace):
-        depends(request, data.dependencies)
         file = self.open_file(workspace, data.ground.files[0])
 
         score = self.scoring(file, data.ground)

diff --git a/agbenchmark/tests/basic_abilities/BasicChallenge.py b/agbenchmark/tests/basic_abilities/BasicChallenge.py
index 0cada86cc..563207405 100644
--- a/agbenchmark/tests/basic_abilities/BasicChallenge.py
+++ b/agbenchmark/tests/basic_abilities/BasicChallenge.py
@@ -2,7 +2,6 @@ import pytest
 from agbenchmark.Challenge import Challenge
 
 
-@pytest.mark.run(order=1)
 @pytest.mark.basic
 class BasicChallenge(Challenge):
     pass

diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json
index 4d04f33e7..8c5ef62db 100644
--- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json
+++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json
@@ -1,9 +1,7 @@
 {
   "category": ["basic"],
   "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
-  "dependencies": [
-    "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file"
-  ],
+  "dependencies": ["test_write_file"],
   "ground": {
     "answer": "random string: this is how we're doing",
     "should_contain": ["random string: this is how we're doing"],

diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py
index 03b2d6cab..494a9b071 100644
--- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py
+++
b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -25,7 +25,7 @@ class TestReadFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.order(after=data.dependencies) + @pytest.mark.depends(on=data.dependencies) def test_read_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 4c94320e0..0a4ef4a2c 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -16,6 +16,7 @@ class TestWriteFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) + @pytest.mark.depends(name="test_write_file") def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index df27f3124..57b94cd7a 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,3 +1,3 @@ -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] agbenchmark/challenges/retrieval/r1/r1_test.py::TestRetrieval1::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[server_response0] diff --git a/poetry.lock b/poetry.lock index 4764bf493..d7939fbfe 100644 --- a/poetry.lock +++ b/poetry.lock @@ -368,6 +368,20 @@ files = [ {file = "frozenlist-1.3.3.tar.gz", hash = "sha256:58bcc55721e8a90b88332d6cd441261ebb22342e238296bb330968952fbb3a6a"}, ] +[[package]] +name = "future-fstrings" +version = "1.2.0" +description = "A backport of fstrings to python<3.6" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "future_fstrings-1.2.0-py2.py3-none-any.whl", hash = "sha256:90e49598b553d8746c4dc7d9442e0359d038c3039d802c91c0a55505da318c63"}, + {file = "future_fstrings-1.2.0.tar.gz", hash = "sha256:6cf41cbe97c398ab5a81168ce0dbb8ad95862d3caf23c21e4430627b90844089"}, +] + +[package.extras] +rewrite = ["tokenize-rt (>=3)"] + [[package]] name = "idna" version = "3.4" @@ -473,6 +487,24 @@ files = [ {file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"}, ] +[[package]] +name = "networkx" +version = "3.1" +description = "Python package for creating and manipulating graphs and networks" +optional = false +python-versions = ">=3.8" +files = [ + {file = "networkx-3.1-py3-none-any.whl", hash = "sha256:4f33f68cb2afcf86f28a45f43efc27a9386b535d567d2127f8f61d51dec58d36"}, + {file = "networkx-3.1.tar.gz", hash = "sha256:de346335408f84de0eada6ff9fafafff9bcda11f0a0dfaa931133debb146ab61"}, +] + +[package.extras] +default = ["matplotlib (>=3.4)", "numpy (>=1.20)", "pandas (>=1.3)", "scipy (>=1.8)"] +developer = ["mypy (>=1.1)", "pre-commit (>=3.2)"] +doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx-theme (>=0.13)", "sphinx (>=6.1)", "sphinx-gallery (>=0.12)", "texext (>=0.6.7)"] +extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"] +test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] + [[package]] name = "openai" version = 
"0.27.8" @@ -596,49 +628,21 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] [[package]] -name = "pytest-dependency" -version = "0.5.1" -description = "Manage dependencies of tests" -optional = false -python-versions = "*" -files = [ - {file = "pytest-dependency-0.5.1.tar.gz", hash = "sha256:c2a892906192663f85030a6ab91304e508e546cddfe557d692d61ec57a1d946b"}, -] - -[package.dependencies] -pytest = ">=3.6.0" - -[[package]] -name = "pytest-order" -version = "1.1.0" -description = "pytest plugin to run your tests in a specific order" -optional = false -python-versions = ">=3.6" -files = [ - {file = "pytest-order-1.1.0.tar.gz", hash = "sha256:139d25b30826b78eebb42722f747eab14c44b88059d7a71d4f79d14a057269a5"}, - {file = "pytest_order-1.1.0-py3-none-any.whl", hash = "sha256:3b3730969c97900fa5cd31ecff80847680ed56b2490954565c14949ba60d9371"}, -] - -[package.dependencies] -pytest = [ - {version = ">=5.0", markers = "python_version < \"3.10\""}, - {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, -] - -[[package]] -name = "pytest-ordering" -version = "0.6" -description = "pytest plugin to run your tests in a specific order" +name = "pytest-depends" +version = "1.0.1" +description = "Tests that depend on other tests" optional = false python-versions = "*" files = [ - {file = "pytest-ordering-0.6.tar.gz", hash = "sha256:561ad653626bb171da78e682f6d39ac33bb13b3e272d406cd555adb6b006bda6"}, - {file = "pytest_ordering-0.6-py2-none-any.whl", hash = "sha256:27fba3fc265f5d0f8597e7557885662c1bdc1969497cd58aff6ed21c3b617de2"}, - {file = "pytest_ordering-0.6-py3-none-any.whl", hash = "sha256:3f314a178dbeb6777509548727dc69edf22d6d9a2867bf2d310ab85c403380b6"}, + {file = "pytest-depends-1.0.1.tar.gz", hash = "sha256:90a28e2b87b75b18abd128c94015248544acac20e4392e9921e5a86f93319dfe"}, + {file = "pytest_depends-1.0.1-py3-none-any.whl", hash = "sha256:a1df072bcc93d77aca3f0946903f5fed8af2d9b0056db1dfc9ed5ac164ab0642"}, ] [package.dependencies] -pytest = "*" +colorama = "*" +future-fstrings = "*" +networkx = "*" +pytest = ">=3" [[package]] name = "requests" @@ -810,4 +814,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "64d22c864fe244497b7ebc81ead1be0b0570b14ee1ced323813d427672e17ff3" +content-hash = "a03dfa9938e062bdf564b7678df9dc9277c7c8e504f14f98084c5a2d497a8f7c" diff --git a/pyproject.toml b/pyproject.toml index fd2c52041..0a4f8ba73 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,9 +14,7 @@ click = "^8.1.3" requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" -pytest-dependency = "^0.5.1" -pytest-ordering = "^0.6" -pytest-order = "^1.1.0" +pytest-depends = "^1.0.1" [build-system] @@ -25,7 +23,7 @@ build-backend = "poetry.core.masonry.api" [tool.pytest.ini_options] minversion = "6.0" -addopts = "--order-dependencies" # -ra -q +addopts = "-ra -q" testpaths = [ "tests", "agbenchmark", ] -- cgit v1.2.3 From adc6b225a6063bc2b0981f1156f25bde9279040e Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 25 Jun 2023 11:12:33 -0400 Subject: update regression tests info --- agbenchmark/challenges/retrieval/r1/r1_test.py | 7 ++++- agbenchmark/conftest.py | 36 +++++++++++++++------- .../basic_abilities/read_file/read_file_test.py | 5 +++ .../basic_abilities/write_file/w_file_data.json | 2 +- .../basic_abilities/write_file/write_file_test.py | 5 +++ 
agbenchmark/tests/regression/RegressionManager.py | 25 +++++++++------ agbenchmark/tests/regression/regression_tests.json | 1 + agbenchmark/tests/regression/regression_tests.txt | 17 ++++++++-- 8 files changed, 73 insertions(+), 25 deletions(-) create mode 100644 agbenchmark/tests/regression/regression_tests.json diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 45becaf75..489d298fb 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -17,7 +17,12 @@ class TestRetrieval1(RetrievalChallenge): [(data.task, data.mock_func)], indirect=True, ) - def test_retrieval(self, request, workspace): + @pytest.mark.parametrize( + "regression_data", + [data], + indirect=True, + ) + def test_retrieval(self, workspace, current_challenge_data): file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 434f6dbde..78114c204 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -6,6 +6,7 @@ from agbenchmark.tests.regression.RegressionManager import RegressionManager import requests from requests.exceptions import RequestException from agbenchmark.mocks.MockManager import MockManager +from agbenchmark.challenges.define_task_types import ChallengeData @pytest.fixture(scope="module") @@ -64,21 +65,34 @@ def server_response(request, config): # print(f"Request succeeded with status code {response.status_code}") -regression_txt = "agbenchmark/tests/regression/regression_tests.txt" +regression_json = "agbenchmark/tests/regression/regression_tests.json" -regression_manager = RegressionManager(regression_txt) +regression_manager = RegressionManager(regression_json) + + +# this is to get the challenge_data from every test +@pytest.fixture(autouse=True) +def regression_data(request): + return request.param def pytest_runtest_makereport(item, call): - """Called for each test report. 
Generated for each stage - of a test run (setup, call, teardown).""" if call.when == "call": - if ( - call.excinfo is None - ): # if no error in the call stage, add it as a regression test - regression_manager.add_test(item.nodeid) - else: # otherwise, :( - regression_manager.remove_test(item.nodeid) + challenge_data = item.funcargs.get("regression_data", None) + difficulty = challenge_data.info.difficulty if challenge_data else "unknown" + dependencies = challenge_data.dependencies if challenge_data else [] + + test_details = { + "difficulty": difficulty, + "dependencies": dependencies, + "test": item.nodeid, + } + + print("pytest_runtest_makereport", test_details) + if call.excinfo is None: + regression_manager.add_test(item.nodeid.split("::")[1], test_details) + else: + regression_manager.remove_test(item.nodeid.split("::")[1]) def pytest_collection_modifyitems(items): @@ -86,7 +100,7 @@ def pytest_collection_modifyitems(items): to add regression marker to collected test items.""" for item in items: print("pytest_collection_modifyitems", item.nodeid) - if item.nodeid + "\n" in regression_manager.tests: + if item.nodeid.split("::")[1] in regression_manager.tests: print(regression_manager.tests) item.add_marker(pytest.mark.regression) diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 494a9b071..7d14228c8 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -25,6 +25,11 @@ class TestReadFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) + @pytest.mark.parametrize( + "regression_data", + [data], + indirect=True, + ) @pytest.mark.depends(on=data.dependencies) def test_read_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json index 562d1c364..1d2621081 100644 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -10,7 +10,7 @@ }, "mock_func": "basic_write_file_mock", "info": { - "difficulty": "easy", + "difficulty": "basic", "description": "Tests the writing to file", "side_effects": ["tests if there is in fact an LLM attached"] } diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 0a4ef4a2c..330128898 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -16,6 +16,11 @@ class TestWriteFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) + @pytest.mark.parametrize( + "regression_data", + [data], + indirect=True, + ) @pytest.mark.depends(name="test_write_file") def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/regression/RegressionManager.py b/agbenchmark/tests/regression/RegressionManager.py index 9117d53f1..a1379ecae 100644 --- a/agbenchmark/tests/regression/RegressionManager.py +++ b/agbenchmark/tests/regression/RegressionManager.py @@ -1,3 +1,6 @@ +import json + + class RegressionManager: """Abstracts interaction with the regression tests file""" @@ -6,17 +9,21 @@ class RegressionManager: self.load() def load(self) -> None: - with open(self.filename, "r") as f: - 
self.tests = f.readlines() + try: + with open(self.filename, "r") as f: + self.tests = json.load(f) + except (FileNotFoundError, json.decoder.JSONDecodeError): + self.tests = {} def save(self) -> None: with open(self.filename, "w") as f: - f.writelines(self.tests) + json.dump(self.tests, f, indent=4) - def add_test(self, test_id) -> None: - if f"{test_id}\n" not in self.tests: - self.tests.append(f"{test_id}\n") + def add_test(self, test_name: str, test_details: dict) -> None: + self.tests[test_name] = test_details + self.save() - def remove_test(self, test_id) -> None: - if f"{test_id}\n" in self.tests: - self.tests.remove(f"{test_id}\n") + def remove_test(self, test_name: str) -> None: + if test_name in self.tests: + del self.tests[test_name] + self.save() diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json new file mode 100644 index 000000000..9e26dfeeb --- /dev/null +++ b/agbenchmark/tests/regression/regression_tests.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index 57b94cd7a..8af722f07 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,3 +1,14 @@ -agbenchmark/challenges/retrieval/r1/r1_test.py::TestRetrieval1::test_retrieval[server_response0] -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] -agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[server_response0] +{ + "agbenchmark/tests/basic_abilities/write_file/write_file_test.py": { + "difficulty": "easy", + "dependencies": [], + "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[regression_data0-server_response0]" + }, + "agbenchmark/tests/basic_abilities/read_file/read_file_test.py": { + "difficulty": "basic", + "dependencies": [ + "test_write_file" + ], + "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[regression_data0-server_response0]" + } +} \ No newline at end of file -- cgit v1.2.3 From 7604ae07bb6d79cfe8e5a28fdf3fa85c83603b1b Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 25 Jun 2023 19:30:04 -0400 Subject: can now put file extensions or names in files data --- agbenchmark/Challenge.py | 22 +++++++++++++++++++++- agbenchmark/challenges/retrieval/r1/r1_test.py | 12 +++++++----- .../basic_abilities/read_file/read_file_test.py | 12 +++++++----- .../basic_abilities/write_file/w_file_data.json | 2 +- .../basic_abilities/write_file/write_file_test.py | 12 +++++++----- agbenchmark/tests/regression/regression_tests.json | 15 ++++++++++++++- 6 files changed, 57 insertions(+), 18 deletions(-) diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py index 9828a0e9e..d159296b1 100644 --- a/agbenchmark/Challenge.py +++ b/agbenchmark/Challenge.py @@ -1,5 +1,5 @@ import os -from typing import Optional +import glob from agbenchmark.challenges.define_task_types import Ground @@ -14,6 +14,26 @@ class Challenge: with open(workspace_dir, "r") as f: return f.read() + @staticmethod + def open_files(workspace: str, file_patterns: list): + script_dir = os.path.abspath(workspace) + files_contents = [] + + for file_pattern in file_patterns: + # Check if it is a file extension + if file_pattern.startswith("."): + # Find all files with the given extension in the workspace 
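# For instance, a ground "files" entry of ".txt" expands here to
# glob.glob(os.path.join(script_dir, "*.txt")) and would match
# file_to_check.txt in the workspace, while a concrete name like
# "file_to_check.txt" falls through to the else branch as a single path.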
+ matching_files = glob.glob(os.path.join(script_dir, "*" + file_pattern)) + else: + # Otherwise, it is a specific file + matching_files = [os.path.join(script_dir, file_pattern)] + + for file_path in matching_files: + with open(file_path, "r") as f: + files_contents.append(f.read()) + + return files_contents + @staticmethod def write_to_file(workspace: str, filename: str, content: str): script_dir = os.path.abspath(workspace) diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 489d298fb..2a7d92a71 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -23,10 +23,12 @@ class TestRetrieval1(RetrievalChallenge): indirect=True, ) def test_retrieval(self, workspace, current_challenge_data): - file = self.open_file(workspace, data.ground.files[0]) + files_contents = self.open_files(workspace, data.ground.files) - score = self.scoring(file, data.ground) + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, data.ground) + print("Your score is:", score) + scores.append(score) - print("You score is:", score) - - assert score + assert 1 in scores diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 7d14228c8..90946670c 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -32,10 +32,12 @@ class TestReadFile(BasicChallenge): ) @pytest.mark.depends(on=data.dependencies) def test_read_file(self, workspace): - file = self.open_file(workspace, data.ground.files[0]) + files_contents = self.open_files(workspace, data.ground.files) - score = self.scoring(file, data.ground) + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, data.ground) + print("Your score is:", score) + scores.append(score) - print("You score is:", score) - - assert score + assert 1 in scores diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json index 1d2621081..037c5bd88 100644 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -6,7 +6,7 @@ "answer": "Washington", "should_contain": ["Washington"], "should_not_contain": ["New York", "Los Angeles", "San Francisco"], - "files": ["file_to_check.txt"] + "files": [".txt"] }, "mock_func": "basic_write_file_mock", "info": { diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 330128898..187378ff1 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -23,10 +23,12 @@ class TestWriteFile(BasicChallenge): ) @pytest.mark.depends(name="test_write_file") def test_write_file(self, workspace): - file = self.open_file(workspace, data.ground.files[0]) + files_contents = self.open_files(workspace, data.ground.files) - score = self.scoring(file, data.ground) + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, data.ground) + print("Your score is:", score) + scores.append(score) - print("You score is:", score) - - assert score + assert 1 in scores diff --git a/agbenchmark/tests/regression/regression_tests.json 
b/agbenchmark/tests/regression/regression_tests.json index 9e26dfeeb..c84fc9c99 100644 --- a/agbenchmark/tests/regression/regression_tests.json +++ b/agbenchmark/tests/regression/regression_tests.json @@ -1 +1,14 @@ -{} \ No newline at end of file +{ + "TestWriteFile": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[regression_data0-server_response0]" + }, + "TestReadFile": { + "difficulty": "basic", + "dependencies": [ + "test_write_file" + ], + "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[regression_data0-server_response0]" + } +} \ No newline at end of file -- cgit v1.2.3 From 4be22ae5abc884404370196bf71da86affe82131 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Mon, 26 Jun 2023 09:27:20 -0400 Subject: mini agi attempt --- agbenchmark/conftest.py | 46 +++++++++++++--------- agbenchmark/tests/regression/regression_tests.json | 15 +------ agent/agbenchmark_run.py | 27 +++++++++++++ agent/mini-agi | 1 + 4 files changed, 56 insertions(+), 33 deletions(-) create mode 100644 agent/agbenchmark_run.py create mode 160000 agent/mini-agi diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 78114c204..b3b69f194 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -7,6 +7,7 @@ import requests from requests.exceptions import RequestException from agbenchmark.mocks.MockManager import MockManager from agbenchmark.challenges.define_task_types import ChallengeData +import subprocess @pytest.fixture(scope="module") @@ -42,27 +43,34 @@ def server_response(request, config): else: task = request.param mock_function_name = None - # print(f"Server starting at {request.module}") - # try: - # response = requests.post( - # f"{config['hostname']}:{config['port']}", data={"task": task} - # ) - # response.raise_for_status() # This will raise an HTTPError if the status is 4xx or 5xx - # except RequestException: - # # If an exception occurs (could be connection, timeout, or HTTP errors), we use the mock - - if mock_function_name: - mock_manager = MockManager( - task - ) # workspace doesn't need to be passed in, stays the same - print("Server unavailable, using mock", mock_function_name) - mock_manager.delegate(mock_function_name) - else: - print("No mock provided") + # get the current file's directory + current_dir = os.path.dirname(os.path.abspath(__file__)) + + # construct the script's path + script_path = os.path.join(current_dir, "..", "agent", "agbenchmark_run.py") + + # form the command + command = ["python", script_path, task] + + # if mock_function_name: + # mock_manager = MockManager( + # task + # ) # workspace doesn't need to be passed in, stays the same + # print("Server unavailable, using mock", mock_function_name) + # mock_manager.delegate(mock_function_name) # else: - # # This code is run if no exception occurred - # print(f"Request succeeded with status code {response.status_code}") + # print("No mock provided") + + try: + # run the command and wait for it to complete + result = subprocess.run( + command, shell=True, check=True, text=True, capture_output=True + ) + return result + except subprocess.CalledProcessError as e: + print(f"Subprocess failed with the following error:\n{e}") + # If the subprocess returns a non-zero exit status regression_json = "agbenchmark/tests/regression/regression_tests.json" diff --git a/agbenchmark/tests/regression/regression_tests.json 
b/agbenchmark/tests/regression/regression_tests.json index c84fc9c99..9e26dfeeb 100644 --- a/agbenchmark/tests/regression/regression_tests.json +++ b/agbenchmark/tests/regression/regression_tests.json @@ -1,14 +1 @@ -{ - "TestWriteFile": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[regression_data0-server_response0]" - }, - "TestReadFile": { - "difficulty": "basic", - "dependencies": [ - "test_write_file" - ], - "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[regression_data0-server_response0]" - } -} \ No newline at end of file +{} \ No newline at end of file diff --git a/agent/agbenchmark_run.py b/agent/agbenchmark_run.py new file mode 100644 index 000000000..f509f5e66 --- /dev/null +++ b/agent/agbenchmark_run.py @@ -0,0 +1,27 @@ +import argparse +import subprocess +import os + + +def main(objective): + # get the current directory + current_dir = os.path.dirname(os.path.abspath(__file__)) + + # form the command + command = ( + f"python {os.path.join(current_dir, 'mini-agi', 'miniagi.py')} {objective}" + ) + + # run the command + subprocess.run(command, shell=True) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run miniagi.py with an objective.") + parser.add_argument( + "objective", type=str, help="The objective to pass to miniagi.py" + ) + + args = parser.parse_args() + + main(args.objective) diff --git a/agent/mini-agi b/agent/mini-agi new file mode 160000 index 000000000..d2add8f18 --- /dev/null +++ b/agent/mini-agi @@ -0,0 +1 @@ +Subproject commit d2add8f18caf96934a2d193583720cfc9b89451b -- cgit v1.2.3 From 84f170c9e0b310219566dbe9538ca1755019f424 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Mon, 26 Jun 2023 09:36:13 -0400 Subject: fixing relative imports --- agbenchmark/start_benchmark.py | 15 +++++++++++---- agent/mini-agi | 1 + 2 files changed, 12 insertions(+), 4 deletions(-) create mode 160000 agent/mini-agi diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index b7a116ebc..3a6a2b860 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -17,8 +17,10 @@ def start(category, noreg): """Start the benchmark tests. 
If a category flag is provided, run the categories with that mark.""" config_file = "agbenchmark/config.json" + config_dir = os.path.abspath(config_file) + # Check if configuration file exists and is not empty - if not os.path.exists(config_file) or os.stat(config_file).st_size == 0: + if not os.path.exists(config_dir) or os.stat(config_dir).st_size == 0: config = {} config["hostname"] = click.prompt( @@ -26,16 +28,21 @@ def start(category, noreg): ) config["port"] = click.prompt("Please enter a new port", default=8080) config["workspace"] = click.prompt( - "Please enter a new workspace path", default="/path/to/workspace" + "Please enter a new workspace path", default="agbenchmark/mocks/workspace" ) - with open(config_file, "w") as f: + with open(config_dir, "w") as f: json.dump(config, f) else: # If the configuration file exists and is not empty, load it - with open(config_file, "r") as f: + with open(config_dir, "r") as f: config = json.load(f) + # create workspace directory if it doesn't exist + workspace_path = config_dir = os.path.abspath(config["workspace"]) + if not os.path.exists(workspace_path): + os.makedirs(workspace_path, exist_ok=True) + print("Current configuration:") for key, value in config.items(): print(f"{key}: {value}") diff --git a/agent/mini-agi b/agent/mini-agi new file mode 160000 index 000000000..d2add8f18 --- /dev/null +++ b/agent/mini-agi @@ -0,0 +1 @@ +Subproject commit d2add8f18caf96934a2d193583720cfc9b89451b -- cgit v1.2.3 From a7972ad8737a8c5cebd3768f02013056c7594c93 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Tue, 27 Jun 2023 13:25:47 -0400 Subject: regression test creation --- agbenchmark/start_benchmark.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index 3a6a2b860..6adcc09bf 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -39,10 +39,17 @@ def start(category, noreg): config = json.load(f) # create workspace directory if it doesn't exist - workspace_path = config_dir = os.path.abspath(config["workspace"]) + workspace_path = os.path.abspath(config["workspace"]) if not os.path.exists(workspace_path): os.makedirs(workspace_path, exist_ok=True) + regression_path = os.path.abspath( + "agbenchmark/tests/regression/regression_tests.txt" + ) + if not os.path.exists(regression_path): + with open(regression_path, "a"): + pass + print("Current configuration:") for key, value in config.items(): print(f"{key}: {value}") -- cgit v1.2.3 From 8c44b9eddf7c566d5e39f7e11149772b96e23a5f Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 24 Jun 2023 09:42:36 -0400 Subject: basic challenges, more ChallengeData structure --- agbenchmark/Challenge.py | 22 ++++++++++++++++ agbenchmark/challenges/define_task_types.py | 16 ++++++++---- agbenchmark/challenges/retrieval/Retrieval.py | 22 +--------------- agbenchmark/challenges/retrieval/r1/r1_data.json | 10 +++++--- agbenchmark/challenges/retrieval/r1/r1_test.py | 6 +++-- agbenchmark/mocks/tests/basic_mocks.py | 28 +++++++++++++++++++++ agbenchmark/mocks/tests/retrieval_mocks.py | 7 +----- .../basic_abilities/read_file/r_file_data.json | 15 +++++++++++ .../basic_abilities/read_file/read_file_test.py | 29 ++++++++++++++++++++++ .../tests/basic_abilities/read_file_test.py | 0 .../basic_abilities/write_file/w_file_data.json | 16 ++++++++++++ .../basic_abilities/write_file/write_file_test.py | 27 ++++++++++++++++++++ .../tests/basic_abilities/write_file_test.py | 0 pyproject.toml | 3 ++- 14 files 
changed, 163 insertions(+), 38 deletions(-) create mode 100644 agbenchmark/tests/basic_abilities/read_file/r_file_data.json create mode 100644 agbenchmark/tests/basic_abilities/read_file/read_file_test.py delete mode 100644 agbenchmark/tests/basic_abilities/read_file_test.py create mode 100644 agbenchmark/tests/basic_abilities/write_file/w_file_data.json create mode 100644 agbenchmark/tests/basic_abilities/write_file/write_file_test.py delete mode 100644 agbenchmark/tests/basic_abilities/write_file_test.py diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py index 20bf55853..9828a0e9e 100644 --- a/agbenchmark/Challenge.py +++ b/agbenchmark/Challenge.py @@ -1,5 +1,6 @@ import os from typing import Optional +from agbenchmark.challenges.define_task_types import Ground class Challenge: @@ -30,3 +31,24 @@ class Challenge: for filename in os.listdir(workspace) if os.path.isfile(os.path.join(workspace, filename)) ] + + def scoring(self, content: str, ground: Ground): + if ground.should_contain: + for should_contain_word in ground.should_contain: + if should_contain_word not in content: + return 0.0 + else: + print( + f"Word that should exist: {should_contain_word} exists in the content" + ) + + if ground.should_not_contain: + for should_not_contain_word in ground.should_not_contain: + if should_not_contain_word in content: + return 0.0 + else: + print( + f"Word that should not exist: {should_not_contain_word} does not exist in the content" + ) + + return 1.0 diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index f1a841b53..879a46af0 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -4,6 +4,12 @@ import json import os +class Info(BaseModel): + difficulty: str + description: str + side_effects: List[str] + + class Ground(BaseModel): answer: str should_contain: Optional[List[str]] @@ -11,20 +17,20 @@ class Ground(BaseModel): files: List[str] -class Challenge(BaseModel): - category: str +class ChallengeData(BaseModel): + category: List[str] task: str ground: Ground - difficulty: str mock_func: Optional[str] = None + info: Info def serialize(self, path: str) -> None: with open(path, "w") as file: file.write(self.json()) @staticmethod - def deserialize(path: str) -> "Challenge": + def deserialize(path: str) -> "ChallengeData": print("Deserializing", path) with open(path, "r") as file: data = json.load(file) - return Challenge(**data) + return ChallengeData(**data) diff --git a/agbenchmark/challenges/retrieval/Retrieval.py b/agbenchmark/challenges/retrieval/Retrieval.py index 2db22ae4d..9434d69c3 100644 --- a/agbenchmark/challenges/retrieval/Retrieval.py +++ b/agbenchmark/challenges/retrieval/Retrieval.py @@ -1,27 +1,7 @@ from agbenchmark.Challenge import Challenge -from agbenchmark.challenges.define_task_types import Ground class RetrievalChallenge(Challenge): """Challenge for information-retrieval""" - def scoring(self, content: str, ground: Ground): - if ground.should_contain: - for should_contain_word in ground.should_contain: - if should_contain_word not in content: - return 0.0 - else: - print( - f"Word that should exist: {should_contain_word} exists in the content" - ) - - if ground.should_not_contain: - for should_not_contain_word in ground.should_not_contain: - if should_not_contain_word in content: - return 0.0 - else: - print( - f"Word that should not exist: {should_not_contain_word} does not exist in the content" - ) - - return 1.0 + pass diff --git 
a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index c7cc31004..08b74d1b7 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -1,5 +1,5 @@ { - "category": "retrieval", + "category": ["basic"], "task": "What is the capital of America?", "ground": { "answer": "Washington", @@ -7,6 +7,10 @@ "should_not_contain": ["New York", "Los Angeles", "San Francisco"], "files": ["file_to_check.txt"] }, - "difficulty": "easy", - "mock_func": "retrieval_1_mock" + "mock_func": "write_file_mock", + "info": { + "difficulty": "easy", + "description": "Tests the writing to file", + "side_effects": ["tests if there is in fact an LLM attached"] + } } diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index e20c9f7b9..d37c5e795 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -1,9 +1,11 @@ import pytest from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge -from agbenchmark.challenges.define_task_types import Challenge, Ground +from agbenchmark.challenges.define_task_types import ChallengeData, Ground import os -data = Challenge.deserialize(os.path.join(os.path.dirname(__file__), "r1_data.json")) +data = ChallengeData.deserialize( + os.path.join(os.path.dirname(__file__), "r1_data.json") +) class TestRetrieval1(RetrievalChallenge): diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index e69de29bb..eb7b96541 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -0,0 +1,28 @@ +from agbenchmark.Challenge import Challenge +from ..basic_gpt_agent import basic_gpt_agent + + +def basic_read_file_mock(task: str, workspace: str): + """ + This mock reads a file and returns its content. + """ + + Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing") + + file_contents = Challenge.open_file(workspace, "file_to_check.txt") + + Challenge.write_to_file( + workspace, "file_to_check.txt", f"random string: {file_contents}" + ) + + +def basic_write_file_mock(task: str, workspace: str): + """ + This mock writes to a file (creates one if it doesn't exist) + """ + + # Call the basic_gpt_agent to get a response. + response = basic_gpt_agent(task) + + # Open the file in write mode. + Challenge.write_to_file(workspace, "file_to_check.txt", response) diff --git a/agbenchmark/mocks/tests/retrieval_mocks.py b/agbenchmark/mocks/tests/retrieval_mocks.py index 23f4bde17..2481de060 100644 --- a/agbenchmark/mocks/tests/retrieval_mocks.py +++ b/agbenchmark/mocks/tests/retrieval_mocks.py @@ -1,4 +1,3 @@ -from ..basic_gpt_agent import basic_gpt_agent from agbenchmark.Challenge import Challenge @@ -6,8 +5,4 @@ from agbenchmark.Challenge import Challenge # Prerequisites here would be writing to a file (basic_abilities test). # Should also check if prerequisites exists in regression file def retrieval_1_mock(task: str, workspace: str): - # Call the basic_gpt_agent to get a response. - response = basic_gpt_agent(task) - - # Open the file in write mode. 
- Challenge.write_to_file(workspace, "file_to_check.txt", response) + pass diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json new file mode 100644 index 000000000..55319ddfc --- /dev/null +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -0,0 +1,15 @@ +{ + "category": ["basic"], + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "ground": { + "answer": "random string: this is how we're doing", + "should_contain": ["random string: this is how we're doing"], + "files": ["file_to_check.txt"] + }, + "mock_func": "basic_read_file_mock", + "info": { + "description": "This reads the file quickly", + "difficulty": "basic", + "side_effects": [""] + } +} diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py new file mode 100644 index 000000000..610ccdab6 --- /dev/null +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -0,0 +1,29 @@ +import pytest +from agbenchmark.challenges.define_task_types import ChallengeData +from agbenchmark.Challenge import Challenge +import os + +data = ChallengeData.deserialize( + os.path.join(os.path.dirname(__file__), "r_file_data.json") +) + + +class TestReadFile(Challenge): + """Testing if LLM can read a file""" + + @pytest.mark.parametrize( + "server_response", + [(data.task, data.mock_func)], + indirect=True, + ) + @pytest.mark.basic + def test_retrieval( + self, workspace + ): # create_file simply there for the function to depend on the fixture + file = self.open_file(workspace, data.ground.files[0]) + + score = self.scoring(file, data.ground) + + print("You score is:", score) + + assert score diff --git a/agbenchmark/tests/basic_abilities/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json new file mode 100644 index 000000000..4aaa1347d --- /dev/null +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -0,0 +1,16 @@ +{ + "category": ["basic"], + "task": "What is the capital of America?", + "ground": { + "answer": "Washington", + "should_contain": ["Washington"], + "should_not_contain": ["New York", "Los Angeles", "San Francisco"], + "files": ["file_to_check.txt"] + }, + "mock_func": "basic_write_file_mock", + "info": { + "difficulty": "easy", + "description": "Tests the writing to file", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py new file mode 100644 index 000000000..ccb10fe70 --- /dev/null +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -0,0 +1,27 @@ +import pytest +from agbenchmark.challenges.define_task_types import ChallengeData +from agbenchmark.Challenge import Challenge +import os + +data = ChallengeData.deserialize( + os.path.join(os.path.dirname(__file__), "w_file_data.json") +) + + +class TestWriteFile(Challenge): + """Testing if LLM can write to a file""" + + @pytest.mark.parametrize( + "server_response", + [(data.task, data.mock_func)], + indirect=True, + ) + @pytest.mark.basic + def test_retrieval(self, workspace): + file = self.open_file(workspace, data.ground.files[0]) 
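# scoring() comes from the Challenge base class added in this commit: it
# returns 1.0 only if every ground.should_contain string is present and no
# ground.should_not_contain string appears in the content, else 0.0.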
+ + score = self.scoring(file, data.ground) + + print("You score is:", score) + + assert score diff --git a/agbenchmark/tests/basic_abilities/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/pyproject.toml b/pyproject.toml index 5498381a2..6f79e75ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,8 @@ testpaths = [ ] markers = [ "retrieval", - "regression" + "regression", + "basic" ] [tool.poetry.scripts] -- cgit v1.2.3 From 22458a04e81f6a4e200581fe4046182b96f6e17c Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 24 Jun 2023 12:15:53 -0400 Subject: file creation from within file before server :) --- agbenchmark/conftest.py | 2 +- agbenchmark/mocks/tests/basic_mocks.py | 2 +- agbenchmark/tests/basic_abilities/read_file/read_file_test.py | 8 ++++++++ agbenchmark/tests/regression/regression_tests.txt | 2 ++ 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 908d39e89..434f6dbde 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -17,7 +17,7 @@ def config(): return config -@pytest.fixture +@pytest.fixture(scope="module") def workspace(config): yield config["workspace"] # teardown after test function completes diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index eb7b96541..bbff6a9c7 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -7,7 +7,7 @@ def basic_read_file_mock(task: str, workspace: str): This mock reads a file and returns its content. """ - Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing") + # Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing") file_contents = Challenge.open_file(workspace, "file_to_check.txt") diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 610ccdab6..35d1d80c5 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -8,6 +8,14 @@ data = ChallengeData.deserialize( ) +@pytest.fixture(scope="module", autouse=True) +def setup_module(workspace): + if data.ground.should_contain: + Challenge.write_to_file( + workspace, data.ground.files[0], "this is how we're doing" + ) + + class TestReadFile(Challenge): """Testing if LLM can read a file""" diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index e69de29bb..a5f8fbd1d 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -0,0 +1,2 @@ +agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0] -- cgit v1.2.3 From 60a7ac2343df15127e38da5d490edab887f81608 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 24 Jun 2023 12:24:17 -0400 Subject: adding dependencies on other challenges --- agbenchmark/mocks/tests/basic_mocks.py | 2 -- .../tests/basic_abilities/read_file/read_file_test.py | 1 + .../tests/basic_abilities/write_file/write_file_test.py | 1 + agbenchmark/tests/regression/regression_tests.txt | 1 - poetry.lock | 15 ++++++++++++++- pyproject.toml | 1 + 6 files changed, 17 insertions(+), 4 
deletions(-) diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index bbff6a9c7..550095b72 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -7,8 +7,6 @@ def basic_read_file_mock(task: str, workspace: str): This mock reads a file and returns its content. """ - # Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing") - file_contents = Challenge.open_file(workspace, "file_to_check.txt") Challenge.write_to_file( diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 35d1d80c5..ea794281e 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -25,6 +25,7 @@ class TestReadFile(Challenge): indirect=True, ) @pytest.mark.basic + @pytest.mark.dependency(depends=["write_file"]) def test_retrieval( self, workspace ): # create_file simply there for the function to depend on the fixture diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index ccb10fe70..b2c559c9e 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -17,6 +17,7 @@ class TestWriteFile(Challenge): indirect=True, ) @pytest.mark.basic + @pytest.mark.dependency(name="write_file") def test_retrieval(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index a5f8fbd1d..84e625af4 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,2 +1 @@ -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0] agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0] diff --git a/poetry.lock b/poetry.lock index 3f1059aaf..3bc37622e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -595,6 +595,19 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +[[package]] +name = "pytest-dependency" +version = "0.5.1" +description = "Manage dependencies of tests" +optional = false +python-versions = "*" +files = [ + {file = "pytest-dependency-0.5.1.tar.gz", hash = "sha256:c2a892906192663f85030a6ab91304e508e546cddfe557d692d61ec57a1d946b"}, +] + +[package.dependencies] +pytest = ">=3.6.0" + [[package]] name = "requests" version = "2.31.0" @@ -765,4 +778,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "a13e69f2bd9e511e1af92ed02b155a90dec38a9b8d983a711e1b67931b467d38" +content-hash = "4a1629eb643b5b68d47f6d1407942aa6d4a796c6d5a1b6a54bbc096b9d0efa2d" diff --git a/pyproject.toml b/pyproject.toml index 6f79e75ce..087ac8447 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ click = "^8.1.3" requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" +pytest-dependency = "^0.5.1" [build-system] -- cgit v1.2.3 From 2f28a66591ea37715282271ccf92560e89a7924a Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 24 Jun 2023 14:42:35 -0400 Subject: more 
elegant marking & dependency solution --- README.md | 74 +++++++++++++++++++--- agbenchmark/challenges/README.md | 38 +++++------ agbenchmark/challenges/define_task_types.py | 1 + agbenchmark/challenges/retrieval/r1/r1_data.json | 1 + .../tests/basic_abilities/BasicChallenge.py | 7 ++ .../basic_abilities/read_file/r_file_data.json | 1 + .../basic_abilities/read_file/read_file_test.py | 12 ++-- .../basic_abilities/write_file/w_file_data.json | 1 + .../basic_abilities/write_file/write_file_test.py | 9 ++- agbenchmark/tests/regression/regression_tests.txt | 2 + poetry.lock | 17 ++++- pyproject.toml | 1 + 12 files changed, 126 insertions(+), 38 deletions(-) create mode 100644 agbenchmark/tests/basic_abilities/BasicChallenge.py diff --git a/README.md b/README.md index 0a8d119af..0ad0cf345 100644 --- a/README.md +++ b/README.md @@ -51,15 +51,73 @@ Share your progress :) to create a test: -``` -@pytest.mark.parametrize( -"server_response", -["VARIABLE"], # VARIABLE = the query/goal you provide to the model -indirect=True, +```python +import pytest +from agbenchmark.challenges.define_task_types import ChallengeData +from ..CategoryChallenge import CategoryChallenge +import os + +data = ChallengeData.deserialize( + os.path.join(os.path.dirname(__file__), "r_file_data.json") ) -@pytest.mark.(VARIABLE) # VARIABLE = category of the test -def test_file_in_workspace(workspace): # VARIABLE = the actual test that asserts -assert os.path.exists(os.path.join(workspace, "file_to_check.txt")) + +class TestSomething(CategoryChallenge): + """Testing if LLM can read a file""" + + @pytest.mark.parametrize( + "server_response", + [(data.task, data.mock_func)], + indirect=True, + ) + def test_retrieval( + self, workspace + ): + # scoring logic goes here +``` + +All challenges will inherit from parent class which has the mark + +```python +@pytest.mark.basic +class BasicChallenge(Challenge): + pass +``` + +If you want to add a custom mark to a Challenge, you must specify it before the test definition + +```python +@pytest.mark.other_mark +def test_retrieval(self, workspace): +``` + +To add a dependency to a challenge use the following + +```python +# to defining what a test depends on +from pytest_dependency import depends + +def test1(self, request, workspace): + depends(request, data.dependencies) +# for defining a test as a dependency +@pytest.mark.dependency() +def test2 +``` + +Ordering of challenges needs to be used in combination with the above to make sure it executes afterwards + +```python +@pytest.mark.run(order=1) +``` + +To create a file to test a challenge, add this to the challenge file which will create a file before running the server + +```python +@pytest.fixture(scope="module", autouse=True) +def setup_module(workspace): + if data.ground.should_contain: + Challenge.write_to_file( + workspace, data.ground.files[0], "this is how we're doing" + ) ``` ## Api diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md index 50efe2c4d..d5229e937 100644 --- a/agbenchmark/challenges/README.md +++ b/agbenchmark/challenges/README.md @@ -4,28 +4,25 @@ Input: -- **category** (str): information-retrieval -- **difficulty**(str): the difficulty of this query. choices from - -## Information-retrieval challenges - -Input: - -- **category** (str): information-retrieval -- **task** (str): the question the agent needs to be solve. +- **category** (str[]): Category of the challenge such as 'retrieval', 'comprehension', etc. _this is not currently used. 
for the future it may be needed_ +- **task** (str): The task that the agent needs to solve. +- **dependencies** (str[]): The dependencies that the challenge needs to run. - **ground** (dict): The ground truth. - - **answer** (str): The raw text of ground truth answer - - **should_contain** (list): the exact strings that is required in the final answer - - **should_not_contain** (list): the exact strings that should not be in the final answer - - **files**: files that the are used for retrieval. Can specify file here or an extension **TODO:** like .txt -- **difficulty**(str): the difficulty of this query. choices from -- **mock_func**: function to mock the agent's response. This is used for testing purposes + - **answer** (str): The raw text of the ground truth answer. + - **should_contain** (list): The exact strings that are required in the final answer. + - **should_not_contain** (list): The exact strings that should not be in the final answer. + - **files** (list): Files that are used for retrieval. Can specify file here or an extension. +- **mock_func** (str): Function to mock the agent's response. This is used for testing purposes. +- **info** (dict): Additional info about the challenge. + - **difficulty** (str): The difficulty of this query. + - **description** (str): Description of the challenge. + - **side_effects** (str[]): Describes the effects of the challenge. Example: ```python { - "category": "retrieval", + "category": ["basic"], "task": "What is the capital of America?", "ground": { "answer": "Washington", @@ -33,11 +30,16 @@ Example: "should_not_contain": ["New York", "Los Angeles", "San Francisco"], "files": ["file_to_check.txt"] }, - "difficulty": "easy" + "mock_func": "write_file_mock", + "info": { + "difficulty": "easy", + "description": "Tests the writing to file", + "side_effects": ["tests if there is in fact an LLM attached"] + } } ``` -Output: +Current Output: - **score** (float): scores range from [0, 1] diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index 879a46af0..694671218 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -20,6 +20,7 @@ class Ground(BaseModel): class ChallengeData(BaseModel): category: List[str] task: str + dependencies: List[str] ground: Ground mock_func: Optional[str] = None info: Info diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index 08b74d1b7..fe05b6d51 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -1,5 +1,6 @@ { "category": ["basic"], + "dependencies": ["test_write_file"], "task": "What is the capital of America?", "ground": { "answer": "Washington", diff --git a/agbenchmark/tests/basic_abilities/BasicChallenge.py b/agbenchmark/tests/basic_abilities/BasicChallenge.py new file mode 100644 index 000000000..563207405 --- /dev/null +++ b/agbenchmark/tests/basic_abilities/BasicChallenge.py @@ -0,0 +1,7 @@ +import pytest +from agbenchmark.Challenge import Challenge + + +@pytest.mark.basic +class BasicChallenge(Challenge): + pass diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index 55319ddfc..8c5ef62db 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -1,6 +1,7 @@ { "category": ["basic"], "task": "Write the 
string 'random string' before any existing text to the file called file_to_check.txt", + "dependencies": ["test_write_file"], "ground": { "answer": "random string: this is how we're doing", "should_contain": ["random string: this is how we're doing"], diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index ea794281e..03b2d6cab 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -1,7 +1,9 @@ import pytest from agbenchmark.challenges.define_task_types import ChallengeData from agbenchmark.Challenge import Challenge +from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os +from pytest_dependency import depends data = ChallengeData.deserialize( os.path.join(os.path.dirname(__file__), "r_file_data.json") @@ -16,7 +18,7 @@ def setup_module(workspace): ) -class TestReadFile(Challenge): +class TestReadFile(BasicChallenge): """Testing if LLM can read a file""" @pytest.mark.parametrize( @@ -24,11 +26,9 @@ class TestReadFile(Challenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.basic - @pytest.mark.dependency(depends=["write_file"]) - def test_retrieval( - self, workspace - ): # create_file simply there for the function to depend on the fixture + def test_read_file(self, request, workspace): + depends(request, data.dependencies) + file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json index 4aaa1347d..562d1c364 100644 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -1,6 +1,7 @@ { "category": ["basic"], "task": "What is the capital of America?", + "dependencies": [], "ground": { "answer": "Washington", "should_contain": ["Washington"], diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index b2c559c9e..b09162e3d 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -1,6 +1,6 @@ import pytest from agbenchmark.challenges.define_task_types import ChallengeData -from agbenchmark.Challenge import Challenge +from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os data = ChallengeData.deserialize( @@ -8,7 +8,7 @@ data = ChallengeData.deserialize( ) -class TestWriteFile(Challenge): +class TestWriteFile(BasicChallenge): """Testing if LLM can write to a file""" @pytest.mark.parametrize( @@ -16,9 +16,8 @@ class TestWriteFile(Challenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.basic - @pytest.mark.dependency(name="write_file") - def test_retrieval(self, workspace): + @pytest.mark.dependency() + def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index 84e625af4..b831003fc 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1 +1,3 @@ 
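Each line recorded in this file is a full pytest node ID; at collection time conftest.py tags matching items so the stable set can be rerun with `pytest -m regression`. A rough sketch of that hook, assuming the regression_manager shown earlier:

    import pytest

    def pytest_collection_modifyitems(items):
        for item in items:
            # regression_manager.tests holds the raw lines of this file
            if item.nodeid + "\n" in regression_manager.tests:
                item.add_marker(pytest.mark.regression)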
agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] diff --git a/poetry.lock b/poetry.lock index 3bc37622e..f6f24c5f2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -608,6 +608,21 @@ files = [ [package.dependencies] pytest = ">=3.6.0" +[[package]] +name = "pytest-ordering" +version = "0.6" +description = "pytest plugin to run your tests in a specific order" +optional = false +python-versions = "*" +files = [ + {file = "pytest-ordering-0.6.tar.gz", hash = "sha256:561ad653626bb171da78e682f6d39ac33bb13b3e272d406cd555adb6b006bda6"}, + {file = "pytest_ordering-0.6-py2-none-any.whl", hash = "sha256:27fba3fc265f5d0f8597e7557885662c1bdc1969497cd58aff6ed21c3b617de2"}, + {file = "pytest_ordering-0.6-py3-none-any.whl", hash = "sha256:3f314a178dbeb6777509548727dc69edf22d6d9a2867bf2d310ab85c403380b6"}, +] + +[package.dependencies] +pytest = "*" + [[package]] name = "requests" version = "2.31.0" @@ -778,4 +793,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "4a1629eb643b5b68d47f6d1407942aa6d4a796c6d5a1b6a54bbc096b9d0efa2d" +content-hash = "65b68e43440faafbd2883edd6b10bc177ab334380e908c27c9f511703065f8e7" diff --git a/pyproject.toml b/pyproject.toml index 087ac8447..faee61c2d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" pytest-dependency = "^0.5.1" +pytest-ordering = "^0.6" [build-system] -- cgit v1.2.3 From 06a6f080543ddffd8baf3aaf51ec97ff1fce86b3 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 25 Jun 2023 00:22:53 -0400 Subject: finally figured out right way to do dependencies --- agbenchmark/challenges/retrieval/Retrieval.py | 2 ++ agbenchmark/challenges/retrieval/r1/r1_data.json | 4 ++-- agbenchmark/challenges/retrieval/r1/r1_test.py | 6 ++++-- agbenchmark/tests/basic_abilities/BasicChallenge.py | 1 + .../tests/basic_abilities/read_file/r_file_data.json | 4 +++- .../tests/basic_abilities/read_file/read_file_test.py | 6 ++---- .../basic_abilities/write_file/write_file_test.py | 1 - agbenchmark/tests/regression/regression_tests.txt | 4 ++-- poetry.lock | 19 ++++++++++++++++++- pyproject.toml | 3 ++- 10 files changed, 36 insertions(+), 14 deletions(-) diff --git a/agbenchmark/challenges/retrieval/Retrieval.py b/agbenchmark/challenges/retrieval/Retrieval.py index 9434d69c3..b8aa81ce3 100644 --- a/agbenchmark/challenges/retrieval/Retrieval.py +++ b/agbenchmark/challenges/retrieval/Retrieval.py @@ -1,6 +1,8 @@ from agbenchmark.Challenge import Challenge +import pytest +@pytest.mark.retrieval class RetrievalChallenge(Challenge): """Challenge for information-retrieval""" diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index fe05b6d51..562d1c364 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -1,14 +1,14 @@ { "category": ["basic"], - "dependencies": ["test_write_file"], "task": "What is the capital of America?", + "dependencies": [], "ground": { "answer": "Washington", "should_contain": ["Washington"], "should_not_contain": ["New York", "Los Angeles", "San Francisco"], "files": ["file_to_check.txt"] }, - "mock_func": "write_file_mock", + "mock_func": 
"basic_write_file_mock", "info": { "difficulty": "easy", "description": "Tests the writing to file", diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index d37c5e795..5e6d6abf4 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -2,6 +2,8 @@ import pytest from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge from agbenchmark.challenges.define_task_types import ChallengeData, Ground import os +from pytest_dependency import depends + data = ChallengeData.deserialize( os.path.join(os.path.dirname(__file__), "r1_data.json") @@ -16,8 +18,8 @@ class TestRetrieval1(RetrievalChallenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.retrieval - def test_retrieval(self, workspace): + def test_retrieval(self, request, workspace): + depends(request, data.dependencies) file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/basic_abilities/BasicChallenge.py b/agbenchmark/tests/basic_abilities/BasicChallenge.py index 563207405..0cada86cc 100644 --- a/agbenchmark/tests/basic_abilities/BasicChallenge.py +++ b/agbenchmark/tests/basic_abilities/BasicChallenge.py @@ -2,6 +2,7 @@ import pytest from agbenchmark.Challenge import Challenge +@pytest.mark.run(order=1) @pytest.mark.basic class BasicChallenge(Challenge): pass diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index 8c5ef62db..4d04f33e7 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -1,7 +1,9 @@ { "category": ["basic"], "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", - "dependencies": ["test_write_file"], + "dependencies": [ + "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file" + ], "ground": { "answer": "random string: this is how we're doing", "should_contain": ["random string: this is how we're doing"], diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 03b2d6cab..ad08da4e0 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -3,7 +3,6 @@ from agbenchmark.challenges.define_task_types import ChallengeData from agbenchmark.Challenge import Challenge from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os -from pytest_dependency import depends data = ChallengeData.deserialize( os.path.join(os.path.dirname(__file__), "r_file_data.json") @@ -26,9 +25,8 @@ class TestReadFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) - def test_read_file(self, request, workspace): - depends(request, data.dependencies) - + @pytest.mark.order(after=data.dependencies) + def test_read_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index b09162e3d..4c94320e0 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -16,7 +16,6 @@ 
class TestWriteFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.dependency() def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index b831003fc..df27f3124 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,3 +1,3 @@ -agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0] -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0] agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] +agbenchmark/challenges/retrieval/r1/r1_test.py::TestRetrieval1::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[server_response0] diff --git a/poetry.lock b/poetry.lock index f6f24c5f2..4764bf493 100644 --- a/poetry.lock +++ b/poetry.lock @@ -608,6 +608,23 @@ files = [ [package.dependencies] pytest = ">=3.6.0" +[[package]] +name = "pytest-order" +version = "1.1.0" +description = "pytest plugin to run your tests in a specific order" +optional = false +python-versions = ">=3.6" +files = [ + {file = "pytest-order-1.1.0.tar.gz", hash = "sha256:139d25b30826b78eebb42722f747eab14c44b88059d7a71d4f79d14a057269a5"}, + {file = "pytest_order-1.1.0-py3-none-any.whl", hash = "sha256:3b3730969c97900fa5cd31ecff80847680ed56b2490954565c14949ba60d9371"}, +] + +[package.dependencies] +pytest = [ + {version = ">=5.0", markers = "python_version < \"3.10\""}, + {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, +] + [[package]] name = "pytest-ordering" version = "0.6" @@ -793,4 +810,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "65b68e43440faafbd2883edd6b10bc177ab334380e908c27c9f511703065f8e7" +content-hash = "64d22c864fe244497b7ebc81ead1be0b0570b14ee1ced323813d427672e17ff3" diff --git a/pyproject.toml b/pyproject.toml index faee61c2d..fd2c52041 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ openai = "^0.27.8" pydantic = "^1.10.9" pytest-dependency = "^0.5.1" pytest-ordering = "^0.6" +pytest-order = "^1.1.0" [build-system] @@ -24,7 +25,7 @@ build-backend = "poetry.core.masonry.api" [tool.pytest.ini_options] minversion = "6.0" -addopts = "-ra -q" +addopts = "--order-dependencies" # -ra -q testpaths = [ "tests", "agbenchmark", ] -- cgit v1.2.3 From a2f79760ce8abdddfc27c5b0b42a58df903b352c Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 25 Jun 2023 08:48:16 -0400 Subject: other was non solution, solution is pytest-depends --- agbenchmark/challenges/README.md | 20 +++--- agbenchmark/challenges/retrieval/r1/r1_test.py | 2 - .../tests/basic_abilities/BasicChallenge.py | 1 - .../basic_abilities/read_file/r_file_data.json | 4 +- .../basic_abilities/read_file/read_file_test.py | 2 +- .../basic_abilities/write_file/write_file_test.py | 1 + agbenchmark/tests/regression/regression_tests.txt | 2 +- poetry.lock | 80 ++++++++++++---------- pyproject.toml | 6 +- 9 files changed, 59 insertions(+), 59 deletions(-) diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md index d5229e937..e457b85c4 100644 --- a/agbenchmark/challenges/README.md +++ b/agbenchmark/challenges/README.md @@ -6,7 +6,7 @@ Input: - **category** (str[]): 
Category of the challenge such as 'retrieval', 'comprehension', etc. _this is not currently used. for the future it may be needed_ - **task** (str): The task that the agent needs to solve. -- **dependencies** (str[]): The dependencies that the challenge needs to run. +- **dependencies** (str[]): The dependencies that the challenge needs to run. Needs to be the full node to the test function. - **ground** (dict): The ground truth. - **answer** (str): The raw text of the ground truth answer. - **should_contain** (list): The exact strings that are required in the final answer. @@ -23,18 +23,20 @@ Example: ```python { "category": ["basic"], - "task": "What is the capital of America?", + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "dependencies": [ + "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file" + ], "ground": { - "answer": "Washington", - "should_contain": ["Washington"], - "should_not_contain": ["New York", "Los Angeles", "San Francisco"], + "answer": "random string: this is how we're doing", + "should_contain": ["random string: this is how we're doing"], "files": ["file_to_check.txt"] }, - "mock_func": "write_file_mock", + "mock_func": "basic_read_file_mock", "info": { - "difficulty": "easy", - "description": "Tests the writing to file", - "side_effects": ["tests if there is in fact an LLM attached"] + "description": "This reads the file quickly", + "difficulty": "basic", + "side_effects": [""] } } diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 5e6d6abf4..45becaf75 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -2,7 +2,6 @@ import pytest from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge from agbenchmark.challenges.define_task_types import ChallengeData, Ground import os -from pytest_dependency import depends data = ChallengeData.deserialize( @@ -19,7 +18,6 @@ class TestRetrieval1(RetrievalChallenge): indirect=True, ) def test_retrieval(self, request, workspace): - depends(request, data.dependencies) file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/basic_abilities/BasicChallenge.py b/agbenchmark/tests/basic_abilities/BasicChallenge.py index 0cada86cc..563207405 100644 --- a/agbenchmark/tests/basic_abilities/BasicChallenge.py +++ b/agbenchmark/tests/basic_abilities/BasicChallenge.py @@ -2,7 +2,6 @@ import pytest from agbenchmark.Challenge import Challenge -@pytest.mark.run(order=1) @pytest.mark.basic class BasicChallenge(Challenge): pass diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index 4d04f33e7..8c5ef62db 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -1,9 +1,7 @@ { "category": ["basic"], "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", - "dependencies": [ - "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file" - ], + "dependencies": ["test_write_file"], "ground": { "answer": "random string: this is how we're doing", "should_contain": ["random string: this is how we're doing"], diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py 
b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index ad08da4e0..494a9b071 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -25,7 +25,7 @@ class TestReadFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.order(after=data.dependencies) + @pytest.mark.depends(on=data.dependencies) def test_read_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 4c94320e0..0a4ef4a2c 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -16,6 +16,7 @@ class TestWriteFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) + @pytest.mark.depends(name="test_write_file") def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index df27f3124..57b94cd7a 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,3 +1,3 @@ -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] agbenchmark/challenges/retrieval/r1/r1_test.py::TestRetrieval1::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[server_response0] diff --git a/poetry.lock b/poetry.lock index 4764bf493..d7939fbfe 100644 --- a/poetry.lock +++ b/poetry.lock @@ -368,6 +368,20 @@ files = [ {file = "frozenlist-1.3.3.tar.gz", hash = "sha256:58bcc55721e8a90b88332d6cd441261ebb22342e238296bb330968952fbb3a6a"}, ] +[[package]] +name = "future-fstrings" +version = "1.2.0" +description = "A backport of fstrings to python<3.6" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "future_fstrings-1.2.0-py2.py3-none-any.whl", hash = "sha256:90e49598b553d8746c4dc7d9442e0359d038c3039d802c91c0a55505da318c63"}, + {file = "future_fstrings-1.2.0.tar.gz", hash = "sha256:6cf41cbe97c398ab5a81168ce0dbb8ad95862d3caf23c21e4430627b90844089"}, +] + +[package.extras] +rewrite = ["tokenize-rt (>=3)"] + [[package]] name = "idna" version = "3.4" @@ -473,6 +487,24 @@ files = [ {file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"}, ] +[[package]] +name = "networkx" +version = "3.1" +description = "Python package for creating and manipulating graphs and networks" +optional = false +python-versions = ">=3.8" +files = [ + {file = "networkx-3.1-py3-none-any.whl", hash = "sha256:4f33f68cb2afcf86f28a45f43efc27a9386b535d567d2127f8f61d51dec58d36"}, + {file = "networkx-3.1.tar.gz", hash = "sha256:de346335408f84de0eada6ff9fafafff9bcda11f0a0dfaa931133debb146ab61"}, +] + +[package.extras] +default = ["matplotlib (>=3.4)", "numpy (>=1.20)", "pandas (>=1.3)", "scipy (>=1.8)"] +developer = ["mypy (>=1.1)", "pre-commit (>=3.2)"] +doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx-theme (>=0.13)", "sphinx (>=6.1)", "sphinx-gallery (>=0.12)", "texext (>=0.6.7)"] +extra = ["lxml (>=4.6)", 
"pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"] +test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] + [[package]] name = "openai" version = "0.27.8" @@ -596,49 +628,21 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] [[package]] -name = "pytest-dependency" -version = "0.5.1" -description = "Manage dependencies of tests" -optional = false -python-versions = "*" -files = [ - {file = "pytest-dependency-0.5.1.tar.gz", hash = "sha256:c2a892906192663f85030a6ab91304e508e546cddfe557d692d61ec57a1d946b"}, -] - -[package.dependencies] -pytest = ">=3.6.0" - -[[package]] -name = "pytest-order" -version = "1.1.0" -description = "pytest plugin to run your tests in a specific order" -optional = false -python-versions = ">=3.6" -files = [ - {file = "pytest-order-1.1.0.tar.gz", hash = "sha256:139d25b30826b78eebb42722f747eab14c44b88059d7a71d4f79d14a057269a5"}, - {file = "pytest_order-1.1.0-py3-none-any.whl", hash = "sha256:3b3730969c97900fa5cd31ecff80847680ed56b2490954565c14949ba60d9371"}, -] - -[package.dependencies] -pytest = [ - {version = ">=5.0", markers = "python_version < \"3.10\""}, - {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, -] - -[[package]] -name = "pytest-ordering" -version = "0.6" -description = "pytest plugin to run your tests in a specific order" +name = "pytest-depends" +version = "1.0.1" +description = "Tests that depend on other tests" optional = false python-versions = "*" files = [ - {file = "pytest-ordering-0.6.tar.gz", hash = "sha256:561ad653626bb171da78e682f6d39ac33bb13b3e272d406cd555adb6b006bda6"}, - {file = "pytest_ordering-0.6-py2-none-any.whl", hash = "sha256:27fba3fc265f5d0f8597e7557885662c1bdc1969497cd58aff6ed21c3b617de2"}, - {file = "pytest_ordering-0.6-py3-none-any.whl", hash = "sha256:3f314a178dbeb6777509548727dc69edf22d6d9a2867bf2d310ab85c403380b6"}, + {file = "pytest-depends-1.0.1.tar.gz", hash = "sha256:90a28e2b87b75b18abd128c94015248544acac20e4392e9921e5a86f93319dfe"}, + {file = "pytest_depends-1.0.1-py3-none-any.whl", hash = "sha256:a1df072bcc93d77aca3f0946903f5fed8af2d9b0056db1dfc9ed5ac164ab0642"}, ] [package.dependencies] -pytest = "*" +colorama = "*" +future-fstrings = "*" +networkx = "*" +pytest = ">=3" [[package]] name = "requests" @@ -810,4 +814,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "64d22c864fe244497b7ebc81ead1be0b0570b14ee1ced323813d427672e17ff3" +content-hash = "a03dfa9938e062bdf564b7678df9dc9277c7c8e504f14f98084c5a2d497a8f7c" diff --git a/pyproject.toml b/pyproject.toml index fd2c52041..0a4f8ba73 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,9 +14,7 @@ click = "^8.1.3" requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" -pytest-dependency = "^0.5.1" -pytest-ordering = "^0.6" -pytest-order = "^1.1.0" +pytest-depends = "^1.0.1" [build-system] @@ -25,7 +23,7 @@ build-backend = "poetry.core.masonry.api" [tool.pytest.ini_options] minversion = "6.0" -addopts = "--order-dependencies" # -ra -q +addopts = "-ra -q" testpaths = [ "tests", "agbenchmark", ] -- cgit v1.2.3 From 2411c35d0eb0af6ff0fb4a64ac2b431ea2d41adb Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 25 Jun 2023 11:12:33 -0400 Subject: update regression tests info --- agbenchmark/challenges/retrieval/r1/r1_test.py | 7 ++++- agbenchmark/conftest.py | 36 +++++++++++++++------- 
.../basic_abilities/read_file/read_file_test.py | 5 +++ .../basic_abilities/write_file/w_file_data.json | 2 +- .../basic_abilities/write_file/write_file_test.py | 5 +++ agbenchmark/tests/regression/RegressionManager.py | 25 +++++++++------ agbenchmark/tests/regression/regression_tests.json | 1 + agbenchmark/tests/regression/regression_tests.txt | 17 ++++++++-- 8 files changed, 73 insertions(+), 25 deletions(-) create mode 100644 agbenchmark/tests/regression/regression_tests.json diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 45becaf75..489d298fb 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -17,7 +17,12 @@ class TestRetrieval1(RetrievalChallenge): [(data.task, data.mock_func)], indirect=True, ) - def test_retrieval(self, request, workspace): + @pytest.mark.parametrize( + "regression_data", + [data], + indirect=True, + ) + def test_retrieval(self, workspace, current_challenge_data): file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 434f6dbde..78114c204 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -6,6 +6,7 @@ from agbenchmark.tests.regression.RegressionManager import RegressionManager import requests from requests.exceptions import RequestException from agbenchmark.mocks.MockManager import MockManager +from agbenchmark.challenges.define_task_types import ChallengeData @pytest.fixture(scope="module") @@ -64,21 +65,34 @@ def server_response(request, config): # print(f"Request succeeded with status code {response.status_code}") -regression_txt = "agbenchmark/tests/regression/regression_tests.txt" +regression_json = "agbenchmark/tests/regression/regression_tests.json" -regression_manager = RegressionManager(regression_txt) +regression_manager = RegressionManager(regression_json) + + +# this is to get the challenge_data from every test +@pytest.fixture(autouse=True) +def regression_data(request): + return request.param def pytest_runtest_makereport(item, call): - """Called for each test report. 
Generated for each stage - of a test run (setup, call, teardown).""" if call.when == "call": - if ( - call.excinfo is None - ): # if no error in the call stage, add it as a regression test - regression_manager.add_test(item.nodeid) - else: # otherwise, :( - regression_manager.remove_test(item.nodeid) + challenge_data = item.funcargs.get("regression_data", None) + difficulty = challenge_data.info.difficulty if challenge_data else "unknown" + dependencies = challenge_data.dependencies if challenge_data else [] + + test_details = { + "difficulty": difficulty, + "dependencies": dependencies, + "test": item.nodeid, + } + + print("pytest_runtest_makereport", test_details) + if call.excinfo is None: + regression_manager.add_test(item.nodeid.split("::")[1], test_details) + else: + regression_manager.remove_test(item.nodeid.split("::")[1]) def pytest_collection_modifyitems(items): @@ -86,7 +100,7 @@ def pytest_collection_modifyitems(items): to add regression marker to collected test items.""" for item in items: print("pytest_collection_modifyitems", item.nodeid) - if item.nodeid + "\n" in regression_manager.tests: + if item.nodeid.split("::")[1] in regression_manager.tests: print(regression_manager.tests) item.add_marker(pytest.mark.regression) diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 494a9b071..7d14228c8 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -25,6 +25,11 @@ class TestReadFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) + @pytest.mark.parametrize( + "regression_data", + [data], + indirect=True, + ) @pytest.mark.depends(on=data.dependencies) def test_read_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json index 562d1c364..1d2621081 100644 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -10,7 +10,7 @@ }, "mock_func": "basic_write_file_mock", "info": { - "difficulty": "easy", + "difficulty": "basic", "description": "Tests the writing to file", "side_effects": ["tests if there is in fact an LLM attached"] } diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 0a4ef4a2c..330128898 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -16,6 +16,11 @@ class TestWriteFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) + @pytest.mark.parametrize( + "regression_data", + [data], + indirect=True, + ) @pytest.mark.depends(name="test_write_file") def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/regression/RegressionManager.py b/agbenchmark/tests/regression/RegressionManager.py index 9117d53f1..a1379ecae 100644 --- a/agbenchmark/tests/regression/RegressionManager.py +++ b/agbenchmark/tests/regression/RegressionManager.py @@ -1,3 +1,6 @@ +import json + + class RegressionManager: """Abstracts interaction with the regression tests file""" @@ -6,17 +9,21 @@ class RegressionManager: self.load() def load(self) -> None: - with open(self.filename, "r") as f: - 
self.tests = f.readlines() + try: + with open(self.filename, "r") as f: + self.tests = json.load(f) + except (FileNotFoundError, json.decoder.JSONDecodeError): + self.tests = {} def save(self) -> None: with open(self.filename, "w") as f: - f.writelines(self.tests) + json.dump(self.tests, f, indent=4) - def add_test(self, test_id) -> None: - if f"{test_id}\n" not in self.tests: - self.tests.append(f"{test_id}\n") + def add_test(self, test_name: str, test_details: dict) -> None: + self.tests[test_name] = test_details + self.save() - def remove_test(self, test_id) -> None: - if f"{test_id}\n" in self.tests: - self.tests.remove(f"{test_id}\n") + def remove_test(self, test_name: str) -> None: + if test_name in self.tests: + del self.tests[test_name] + self.save() diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json new file mode 100644 index 000000000..9e26dfeeb --- /dev/null +++ b/agbenchmark/tests/regression/regression_tests.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index 57b94cd7a..8af722f07 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,3 +1,14 @@ -agbenchmark/challenges/retrieval/r1/r1_test.py::TestRetrieval1::test_retrieval[server_response0] -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] -agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[server_response0] +{ + "agbenchmark/tests/basic_abilities/write_file/write_file_test.py": { + "difficulty": "easy", + "dependencies": [], + "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[regression_data0-server_response0]" + }, + "agbenchmark/tests/basic_abilities/read_file/read_file_test.py": { + "difficulty": "basic", + "dependencies": [ + "test_write_file" + ], + "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[regression_data0-server_response0]" + } +} \ No newline at end of file -- cgit v1.2.3 From d6a6e69f2e3ed1cd4bb1715ae737ad50d6b17cb9 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 25 Jun 2023 19:30:04 -0400 Subject: can now put file extensions or names in files data --- agbenchmark/Challenge.py | 22 +++++++++++++++++++++- agbenchmark/challenges/retrieval/r1/r1_test.py | 12 +++++++----- .../basic_abilities/read_file/read_file_test.py | 12 +++++++----- .../basic_abilities/write_file/w_file_data.json | 2 +- .../basic_abilities/write_file/write_file_test.py | 12 +++++++----- agbenchmark/tests/regression/regression_tests.json | 15 ++++++++++++++- 6 files changed, 57 insertions(+), 18 deletions(-) diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py index 9828a0e9e..d159296b1 100644 --- a/agbenchmark/Challenge.py +++ b/agbenchmark/Challenge.py @@ -1,5 +1,5 @@ import os -from typing import Optional +import glob from agbenchmark.challenges.define_task_types import Ground @@ -14,6 +14,26 @@ class Challenge: with open(workspace_dir, "r") as f: return f.read() + @staticmethod + def open_files(workspace: str, file_patterns: list): + script_dir = os.path.abspath(workspace) + files_contents = [] + + for file_pattern in file_patterns: + # Check if it is a file extension + if file_pattern.startswith("."): + # Find all files with the given extension in the workspace 
+ matching_files = glob.glob(os.path.join(script_dir, "*" + file_pattern)) + else: + # Otherwise, it is a specific file + matching_files = [os.path.join(script_dir, file_pattern)] + + for file_path in matching_files: + with open(file_path, "r") as f: + files_contents.append(f.read()) + + return files_contents + @staticmethod def write_to_file(workspace: str, filename: str, content: str): script_dir = os.path.abspath(workspace) diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 489d298fb..2a7d92a71 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -23,10 +23,12 @@ class TestRetrieval1(RetrievalChallenge): indirect=True, ) def test_retrieval(self, workspace, current_challenge_data): - file = self.open_file(workspace, data.ground.files[0]) + files_contents = self.open_files(workspace, data.ground.files) - score = self.scoring(file, data.ground) + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, data.ground) + print("Your score is:", score) + scores.append(score) - print("You score is:", score) - - assert score + assert 1 in scores diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 7d14228c8..90946670c 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -32,10 +32,12 @@ class TestReadFile(BasicChallenge): ) @pytest.mark.depends(on=data.dependencies) def test_read_file(self, workspace): - file = self.open_file(workspace, data.ground.files[0]) + files_contents = self.open_files(workspace, data.ground.files) - score = self.scoring(file, data.ground) + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, data.ground) + print("Your score is:", score) + scores.append(score) - print("You score is:", score) - - assert score + assert 1 in scores diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json index 1d2621081..037c5bd88 100644 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -6,7 +6,7 @@ "answer": "Washington", "should_contain": ["Washington"], "should_not_contain": ["New York", "Los Angeles", "San Francisco"], - "files": ["file_to_check.txt"] + "files": [".txt"] }, "mock_func": "basic_write_file_mock", "info": { diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 330128898..187378ff1 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -23,10 +23,12 @@ class TestWriteFile(BasicChallenge): ) @pytest.mark.depends(name="test_write_file") def test_write_file(self, workspace): - file = self.open_file(workspace, data.ground.files[0]) + files_contents = self.open_files(workspace, data.ground.files) - score = self.scoring(file, data.ground) + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, data.ground) + print("Your score is:", score) + scores.append(score) - print("You score is:", score) - - assert score + assert 1 in scores diff --git a/agbenchmark/tests/regression/regression_tests.json 
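The `open_files` helper added above accepts either exact filenames or bare extensions, which is what lets ground-truth entries like `".txt"` match whatever text file the agent happened to create. A usage sketch against a throwaway workspace, assuming the `agbenchmark` package is importable:

```python
import os
import tempfile

from agbenchmark.Challenge import Challenge

workspace = tempfile.mkdtemp()
with open(os.path.join(workspace, "file_to_check.txt"), "w") as f:
    f.write("Washington")

# An exact filename and a bare extension both resolve to the same file here.
print(Challenge.open_files(workspace, ["file_to_check.txt"]))  # ['Washington']
print(Challenge.open_files(workspace, [".txt"]))               # ['Washington']
```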
b/agbenchmark/tests/regression/regression_tests.json index 9e26dfeeb..c84fc9c99 100644 --- a/agbenchmark/tests/regression/regression_tests.json +++ b/agbenchmark/tests/regression/regression_tests.json @@ -1 +1,14 @@ -{} \ No newline at end of file +{ + "TestWriteFile": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[regression_data0-server_response0]" + }, + "TestReadFile": { + "difficulty": "basic", + "dependencies": [ + "test_write_file" + ], + "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[regression_data0-server_response0]" + } +} \ No newline at end of file -- cgit v1.2.3 From fa0df12439b7beea91a46f08e7f6154900dc1047 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Mon, 26 Jun 2023 09:27:20 -0400 Subject: mini agi attempt --- agbenchmark/conftest.py | 46 +++++++++++++--------- agbenchmark/tests/regression/regression_tests.json | 15 +------ agent/agbenchmark_run.py | 27 +++++++++++++ 3 files changed, 55 insertions(+), 33 deletions(-) create mode 100644 agent/agbenchmark_run.py diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 78114c204..b3b69f194 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -7,6 +7,7 @@ import requests from requests.exceptions import RequestException from agbenchmark.mocks.MockManager import MockManager from agbenchmark.challenges.define_task_types import ChallengeData +import subprocess @pytest.fixture(scope="module") @@ -42,27 +43,34 @@ def server_response(request, config): else: task = request.param mock_function_name = None - # print(f"Server starting at {request.module}") - # try: - # response = requests.post( - # f"{config['hostname']}:{config['port']}", data={"task": task} - # ) - # response.raise_for_status() # This will raise an HTTPError if the status is 4xx or 5xx - # except RequestException: - # # If an exception occurs (could be connection, timeout, or HTTP errors), we use the mock - - if mock_function_name: - mock_manager = MockManager( - task - ) # workspace doesn't need to be passed in, stays the same - print("Server unavailable, using mock", mock_function_name) - mock_manager.delegate(mock_function_name) - else: - print("No mock provided") + # get the current file's directory + current_dir = os.path.dirname(os.path.abspath(__file__)) + + # construct the script's path + script_path = os.path.join(current_dir, "..", "agent", "agbenchmark_run.py") + + # form the command + command = ["python", script_path, task] + + # if mock_function_name: + # mock_manager = MockManager( + # task + # ) # workspace doesn't need to be passed in, stays the same + # print("Server unavailable, using mock", mock_function_name) + # mock_manager.delegate(mock_function_name) # else: - # # This code is run if no exception occurred - # print(f"Request succeeded with status code {response.status_code}") + # print("No mock provided") + + try: + # run the command and wait for it to complete + result = subprocess.run( + command, shell=True, check=True, text=True, capture_output=True + ) + return result + except subprocess.CalledProcessError as e: + print(f"Subprocess failed with the following error:\n{e}") + # If the subprocess returns a non-zero exit status regression_json = "agbenchmark/tests/regression/regression_tests.json" diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json index c84fc9c99..9e26dfeeb 100644 --- 
a/agbenchmark/tests/regression/regression_tests.json +++ b/agbenchmark/tests/regression/regression_tests.json @@ -1,14 +1 @@ -{ - "TestWriteFile": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[regression_data0-server_response0]" - }, - "TestReadFile": { - "difficulty": "basic", - "dependencies": [ - "test_write_file" - ], - "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[regression_data0-server_response0]" - } -} \ No newline at end of file +{} \ No newline at end of file diff --git a/agent/agbenchmark_run.py b/agent/agbenchmark_run.py new file mode 100644 index 000000000..f509f5e66 --- /dev/null +++ b/agent/agbenchmark_run.py @@ -0,0 +1,27 @@ +import argparse +import subprocess +import os + + +def main(objective): + # get the current directory + current_dir = os.path.dirname(os.path.abspath(__file__)) + + # form the command + command = ( + f"python {os.path.join(current_dir, 'mini-agi', 'miniagi.py')} {objective}" + ) + + # run the command + subprocess.run(command, shell=True) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run miniagi.py with an objective.") + parser.add_argument( + "objective", type=str, help="The objective to pass to miniagi.py" + ) + + args = parser.parse_args() + + main(args.objective) -- cgit v1.2.3 From f933717d8b6f28e268437e000a57e187076287af Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Tue, 27 Jun 2023 18:17:54 -0400 Subject: mini-agi, simple challenge creation, --mock flag --- .env.example | 4 + README.md | 2 +- agbenchmark/Challenge.py | 53 ++++++++++- agbenchmark/challenges/define_task_types.py | 12 ++- agbenchmark/challenges/retrieval/r1/r1_data.json | 12 ++- agbenchmark/challenges/retrieval/r1/r1_test.py | 24 ++--- agbenchmark/config.json | 2 +- agbenchmark/conftest.py | 103 ++++++++++++++------- agbenchmark/start_benchmark.py | 20 +++- .../tests/basic_abilities/BasicChallenge.py | 2 + .../basic_abilities/read_file/r_file_data.json | 7 +- .../basic_abilities/read_file/read_file_test.py | 39 +++----- .../basic_abilities/write_file/w_file_data.json | 8 +- .../basic_abilities/write_file/write_file_test.py | 26 ++---- agbenchmark/tests/regression/regression_tests.json | 15 ++- agbenchmark/tests/regression/regression_tests.txt | 14 --- agent/agbenchmark_run.py | 27 ------ poetry.lock | 16 +++- pyproject.toml | 3 +- 19 files changed, 233 insertions(+), 156 deletions(-) create mode 100644 .env.example delete mode 100644 agbenchmark/tests/regression/regression_tests.txt delete mode 100644 agent/agbenchmark_run.py diff --git a/.env.example b/.env.example new file mode 100644 index 000000000..0a91118a9 --- /dev/null +++ b/.env.example @@ -0,0 +1,4 @@ +OPENAI_API_KEY= +AGENT_NAME=mini-agi +AGENT_TIMEOUT=60 +MOCK_TEST=False \ No newline at end of file diff --git a/README.md b/README.md index 0ad0cf345..794279478 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ class TestSomething(CategoryChallenge): """Testing if LLM can read a file""" @pytest.mark.parametrize( - "server_response", + "run_agent", [(data.task, data.mock_func)], indirect=True, ) diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py index d159296b1..f644abc4a 100644 --- a/agbenchmark/Challenge.py +++ b/agbenchmark/Challenge.py @@ -1,12 +1,63 @@ import os import glob +import pytest +from abc import ABC, abstractmethod from agbenchmark.challenges.define_task_types import Ground +from 
agbenchmark.challenges.define_task_types import ChallengeData +from dotenv import load_dotenv, set_key +load_dotenv() -class Challenge: +mock_test_str = os.getenv("MOCK_TEST") +MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str else False + + +class Challenge(ABC): """The parent class to all specific challenges classes. Defines helper methods for running a challenge""" + @abstractmethod + def get_file_path(self) -> str: + """This should be implemented by any class which inherits from BasicChallenge""" + pass + + @property + def data(self) -> ChallengeData: + return ChallengeData.deserialize(self.get_file_path()) + + @property + def mock(self): + return self.data.mock.mock_func if self.data.mock else None + + @property + def task(self): + return ( + self.data.mock.mock_task if self.data.mock and MOCK_TEST else self.data.task + ) + + @property + def dependencies(self) -> list: + print("self.data.dependencies", self.data.dependencies) + return self.data.dependencies + + @property + def name(self) -> str: + print("self.data.name", self.data.name) + return self.data.name + + @pytest.mark.parametrize( + "run_agent", + [(task, mock)], + indirect=True, + ) + @pytest.mark.parametrize( + "challenge_data", + [data], + indirect=True, + ) + def test_method(self, workspace): + raise NotImplementedError + @staticmethod def open_file(workspace: str, filename: str): script_dir = os.path.abspath(workspace) diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index 694671218..7fc2361b5 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -4,6 +4,11 @@ import json import os +class Mock(BaseModel): + mock_func: str + mock_task: Optional[str] = None + + class Info(BaseModel): difficulty: str description: str @@ -12,17 +17,18 @@ class Info(BaseModel): class Ground(BaseModel): answer: str - should_contain: Optional[List[str]] - should_not_contain: Optional[List[str]] + should_contain: Optional[List[str]] = None + should_not_contain: Optional[List[str]] = None files: List[str] class ChallengeData(BaseModel): + name: str category: List[str] task: str dependencies: List[str] ground: Ground - mock_func: Optional[str] = None + mock: Optional[Mock] = None info: Info def serialize(self, path: str) -> None: diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index 562d1c364..80c5e51eb 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -1,16 +1,20 @@ { + "name": "retrieval1", "category": ["basic"], - "task": "What is the capital of America?", + "task": "Print the the capital of America to a .txt file", "dependencies": [], "ground": { "answer": "Washington", "should_contain": ["Washington"], "should_not_contain": ["New York", "Los Angeles", "San Francisco"], - "files": ["file_to_check.txt"] + "files": [".txt"] + }, + "mock": { + "mock_func": "basic_write_file_mock", + "mock_task": "What is the capital of America?" 
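With each test now deriving everything from its JSON file through the `data` property, the pydantic models above act as the single schema: `ChallengeData.deserialize` reads a path like `r1_data.json` and validates it. A small sketch of the round trip (field values are illustrative, not a real challenge):

```python
from agbenchmark.challenges.define_task_types import (
    ChallengeData,
    Ground,
    Info,
    Mock,
)

data = ChallengeData(
    name="example_challenge",
    category=["basic"],
    task="Print the capital of America to a .txt file",
    dependencies=[],
    ground=Ground(
        answer="Washington",
        should_contain=["Washington"],
        files=[".txt"],
    ),
    mock=Mock(
        mock_func="basic_write_file_mock",
        mock_task="What is the capital of America?",
    ),
    info=Info(
        difficulty="basic",
        description="Illustrative entry only",
        side_effects=[],
    ),
)

data.serialize("example_data.json")
print(ChallengeData.deserialize("example_data.json").task)
```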
}, - "mock_func": "basic_write_file_mock", "info": { - "difficulty": "easy", + "difficulty": "basic", "description": "Tests the writing to file", "side_effects": ["tests if there is in fact an LLM attached"] } diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 2a7d92a71..0bd907d8a 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -4,30 +4,18 @@ from agbenchmark.challenges.define_task_types import ChallengeData, Ground import os -data = ChallengeData.deserialize( - os.path.join(os.path.dirname(__file__), "r1_data.json") -) - - class TestRetrieval1(RetrievalChallenge): """The first information-retrieval challenge""" - @pytest.mark.parametrize( - "server_response", - [(data.task, data.mock_func)], - indirect=True, - ) - @pytest.mark.parametrize( - "regression_data", - [data], - indirect=True, - ) - def test_retrieval(self, workspace, current_challenge_data): - files_contents = self.open_files(workspace, data.ground.files) + def get_file_path(self) -> str: # all tests must implement this method + return os.path.join(os.path.dirname(__file__), "r1_data.json") + + def test_method(self, workspace): + files_contents = self.open_files(workspace, self.data.ground.files) scores = [] for file_content in files_contents: - score = self.scoring(file_content, data.ground) + score = self.scoring(file_content, self.data.ground) print("Your score is:", score) scores.append(score) diff --git a/agbenchmark/config.json b/agbenchmark/config.json index d285627e5..9e5c1880f 100644 --- a/agbenchmark/config.json +++ b/agbenchmark/config.json @@ -1,5 +1,5 @@ { "hostname": "localhost", "port": 8080, - "workspace": "agbenchmark/mocks/workspace" + "workspace": "C:/Users/silen/miniagi" } diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index b3b69f194..4edd4b5e0 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -4,18 +4,24 @@ import pytest import shutil from agbenchmark.tests.regression.RegressionManager import RegressionManager import requests -from requests.exceptions import RequestException from agbenchmark.mocks.MockManager import MockManager -from agbenchmark.challenges.define_task_types import ChallengeData import subprocess +from agbenchmark.Challenge import Challenge +from dotenv import load_dotenv + +load_dotenv() @pytest.fixture(scope="module") -def config(): +def config(request): config_file = os.path.abspath("agbenchmark/config.json") print(f"Config file: {config_file}") with open(config_file, "r") as f: config = json.load(f) + + if request.config.getoption("--mock"): + config["workspace"] = "agbenchmark/mocks/workspace" + return config @@ -34,43 +40,49 @@ def workspace(config): print(f"Failed to delete {file_path}. 
Reason: {e}") +def pytest_addoption(parser): + parser.addoption("--mock", action="store_true", default=False) + + +AGENT_NAME = os.getenv("AGENT_NAME") +AGENT_TIMEOUT = os.getenv("AGENT_TIMEOUT") + + @pytest.fixture(autouse=True) -def server_response(request, config): +def run_agent(request, config): """Calling to get a response""" if isinstance(request.param, tuple): task = request.param[0] # The task is passed in indirectly - mock_function_name = request.param[1] + mock_function_name = request.param[1] or None else: task = request.param mock_function_name = None - # get the current file's directory - current_dir = os.path.dirname(os.path.abspath(__file__)) - - # construct the script's path - script_path = os.path.join(current_dir, "..", "agent", "agbenchmark_run.py") - - # form the command - command = ["python", script_path, task] + if mock_function_name != None and (request.config.getoption("--mock")): + if mock_function_name: + mock_manager = MockManager( + task + ) # workspace doesn't need to be passed in, stays the same + print("Server unavailable, using mock", mock_function_name) + mock_manager.delegate(mock_function_name) + else: + print("No mock provided") + else: + path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}") - # if mock_function_name: - # mock_manager = MockManager( - # task - # ) # workspace doesn't need to be passed in, stays the same - # print("Server unavailable, using mock", mock_function_name) - # mock_manager.delegate(mock_function_name) - # else: - # print("No mock provided") + try: + timeout = int(AGENT_TIMEOUT) if AGENT_TIMEOUT is not None else 60 - try: - # run the command and wait for it to complete - result = subprocess.run( - command, shell=True, check=True, text=True, capture_output=True - ) - return result - except subprocess.CalledProcessError as e: - print(f"Subprocess failed with the following error:\n{e}") - # If the subprocess returns a non-zero exit status + subprocess.run( + ["python", "miniagi.py", task], + check=True, + cwd=path, + timeout=timeout + # text=True, + # capture_output=True + ) + except subprocess.TimeoutExpired: + print("The subprocess has exceeded the time limit and was terminated.") regression_json = "agbenchmark/tests/regression/regression_tests.json" @@ -80,13 +92,13 @@ regression_manager = RegressionManager(regression_json) # this is to get the challenge_data from every test @pytest.fixture(autouse=True) -def regression_data(request): +def challenge_data(request): return request.param def pytest_runtest_makereport(item, call): if call.when == "call": - challenge_data = item.funcargs.get("regression_data", None) + challenge_data = item.funcargs.get("challenge_data", None) difficulty = challenge_data.info.difficulty if challenge_data else "unknown" dependencies = challenge_data.dependencies if challenge_data else [] @@ -105,9 +117,9 @@ def pytest_runtest_makereport(item, call): def pytest_collection_modifyitems(items): """Called once all test items are collected. 
Used - to add regression marker to collected test items.""" + to add regression and depends markers to collected test items.""" for item in items: - print("pytest_collection_modifyitems", item.nodeid) + # regression add if item.nodeid.split("::")[1] in regression_manager.tests: print(regression_manager.tests) item.add_marker(pytest.mark.regression) @@ -116,3 +128,26 @@ def pytest_collection_modifyitems(items): def pytest_sessionfinish(): """Called at the end of the session to save regression tests""" regression_manager.save() + + +# this is so that all tests can inherit from the Challenge class +def pytest_generate_tests(metafunc): + if "challenge_data" in metafunc.fixturenames: + # Get the instance of the test class + test_class = metafunc.cls() + + # Generate the parameters + params = test_class.data + + # Add the parameters to the test function + metafunc.parametrize("challenge_data", [params], indirect=True) + + if "run_agent" in metafunc.fixturenames: + # Get the instance of the test class + test_class = metafunc.cls() + + # Generate the parameters + params = [(test_class.task, test_class.mock)] + + # Add the parameters to the test function + metafunc.parametrize("run_agent", params, indirect=True) diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index 6adcc09bf..ac612293a 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -2,6 +2,10 @@ import click import pytest import json import os +from pathlib import Path +from dotenv import load_dotenv, set_key + +load_dotenv() @click.group() @@ -12,8 +16,8 @@ def cli(): @cli.command() @click.option("--category", default=None, help="Specific category to run") @click.option("--noreg", is_flag=True, help="Skip regression tests") -def start(category, noreg): - """Start the benchmark tests. If a category flag is is provided, run the categories with that mark.""" +@click.option("--mock", is_flag=True, help="Run with mock") +def start(category, noreg, mock): """Start the benchmark tests. 
If a category flag is provided, run the categories with that mark.""" config_file = "agbenchmark/config.json" @@ -28,7 +32,8 @@ def start(category, noreg): ) config["port"] = click.prompt("Please enter a new port", default=8080) config["workspace"] = click.prompt( - "Please enter a new workspace path", default="agbenchmark/mocks/workspace" + "Please enter a new workspace path", + default=os.path.join(Path.home(), "miniagi"), ) with open(config_dir, "w") as f: @@ -38,13 +43,17 @@ def start(category, noreg): with open(config_dir, "r") as f: config = json.load(f) + set_key(".env", "MOCK_TEST", "True" if mock else "False") + if mock: + config["workspace"] = "agbenchmark/mocks/workspace" + # create workspace directory if it doesn't exist workspace_path = os.path.abspath(config["workspace"]) if not os.path.exists(workspace_path): os.makedirs(workspace_path, exist_ok=True) regression_path = os.path.abspath( - "agbenchmark/tests/regression/regression_tests.txt" + "agbenchmark/tests/regression/regression_tests.json" ) if not os.path.exists(regression_path): with open(regression_path, "a"): @@ -74,6 +83,9 @@ def start(category, noreg): else: print("Running all categorys") # run all categorys + if mock: + pytest_args.append("--mock") + # Run pytest with the constructed arguments pytest.main(pytest_args) diff --git a/agbenchmark/tests/basic_abilities/BasicChallenge.py b/agbenchmark/tests/basic_abilities/BasicChallenge.py index 563207405..6e7f73100 100644 --- a/agbenchmark/tests/basic_abilities/BasicChallenge.py +++ b/agbenchmark/tests/basic_abilities/BasicChallenge.py @@ -1,5 +1,7 @@ import pytest from agbenchmark.Challenge import Challenge +from agbenchmark.challenges.define_task_types import ChallengeData +from abc import abstractmethod @pytest.mark.basic diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index 8c5ef62db..b21e2724b 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -1,13 +1,16 @@ { + "name": "basic_read_file", "category": ["basic"], "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", - "dependencies": ["test_write_file"], + "dependencies": ["basic_write_file"], "ground": { "answer": "random string: this is how we're doing", "should_contain": ["random string: this is how we're doing"], "files": ["file_to_check.txt"] }, - "mock_func": "basic_read_file_mock", + "mock": { + "mock_func": "basic_read_file_mock" + }, "info": { "description": "This reads the file quickly", "difficulty": "basic", diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 90946670c..68288a42c 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -4,39 +4,30 @@ from agbenchmark.Challenge import Challenge from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os -data = ChallengeData.deserialize( - os.path.join(os.path.dirname(__file__), "r_file_data.json") -) +class TestReadFile(BasicChallenge): + """Testing if LLM can read a file""" -@pytest.fixture(scope="module", autouse=True) -def setup_module(workspace): - if data.ground.should_contain: + @pytest.fixture( + scope="module", autouse=True + ) # this is specific to setting up a file for the test, not all tests have this + def 
setup_module(self, workspace): Challenge.write_to_file( - workspace, data.ground.files[0], "this is how we're doing" + workspace, self.data.ground.files[0], "this is how we're doing" ) + def get_file_path(self) -> str: # all tests must implement this method + return os.path.join(os.path.dirname(__file__), "r_file_data.json") -class TestReadFile(BasicChallenge): - """Testing if LLM can read a file""" - - @pytest.mark.parametrize( - "server_response", - [(data.task, data.mock_func)], - indirect=True, - ) - @pytest.mark.parametrize( - "regression_data", - [data], - indirect=True, - ) - @pytest.mark.depends(on=data.dependencies) - def test_read_file(self, workspace): - files_contents = self.open_files(workspace, data.ground.files) + @pytest.mark.depends(on=["basic_write_file"], name="basic_read_file") + def test_method( + self, workspace + ): # run_test is a common name that all tests must implement + files_contents = self.open_files(workspace, self.data.ground.files) scores = [] for file_content in files_contents: - score = self.scoring(file_content, data.ground) + score = self.scoring(file_content, self.data.ground) print("Your score is:", score) scores.append(score) diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json index 037c5bd88..358ebb538 100644 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -1,6 +1,7 @@ { + "name": "basic_write_file", "category": ["basic"], - "task": "What is the capital of America?", + "task": "Print the the capital of America to a .txt file", "dependencies": [], "ground": { "answer": "Washington", @@ -8,7 +9,10 @@ "should_not_contain": ["New York", "Los Angeles", "San Francisco"], "files": [".txt"] }, - "mock_func": "basic_write_file_mock", + "mock": { + "mock_func": "basic_write_file_mock", + "mock_task": "What is the capital of America?" 
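The tests above funnel every candidate file through `self.scoring(file_content, self.data.ground)` and assert that at least one file scores 1. The body of `scoring` is not shown in these patches; based on the `Ground` fields, a plausible reading is an all-or-nothing string check along these lines (an assumption for illustration, not the repo's actual implementation):

```python
from agbenchmark.challenges.define_task_types import Ground


def scoring(content: str, ground: Ground) -> float:
    # Assumed semantics: every required phrase must appear...
    for phrase in ground.should_contain or []:
        if phrase not in content:
            return 0.0
    # ...and no forbidden phrase may appear.
    for phrase in ground.should_not_contain or []:
        if phrase in content:
            return 0.0
    return 1.0


ground = Ground(
    answer="Washington",
    should_contain=["Washington"],
    should_not_contain=["New York"],
    files=[".txt"],
)
print(scoring("Washington is the capital", ground))  # 1.0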
+ }, "info": { "difficulty": "basic", "description": "Tests the writing to file", diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 187378ff1..8caa6605a 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -3,31 +3,21 @@ from agbenchmark.challenges.define_task_types import ChallengeData from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os -data = ChallengeData.deserialize( - os.path.join(os.path.dirname(__file__), "w_file_data.json") -) - class TestWriteFile(BasicChallenge): """Testing if LLM can write to a file""" - @pytest.mark.parametrize( - "server_response", - [(data.task, data.mock_func)], - indirect=True, - ) - @pytest.mark.parametrize( - "regression_data", - [data], - indirect=True, - ) - @pytest.mark.depends(name="test_write_file") - def test_write_file(self, workspace): - files_contents = self.open_files(workspace, data.ground.files) + def get_file_path(self) -> str: # all tests must implement this method + return os.path.join(os.path.dirname(__file__), "w_file_data.json") + + @pytest.mark.depends(on=[], name="basic_write_file") + def test_method(self, workspace): + print("my workspace is ", workspace) + files_contents = self.open_files(workspace, self.data.ground.files) scores = [] for file_content in files_contents: - score = self.scoring(file_content, data.ground) + score = self.scoring(file_content, self.data.ground) print("Your score is:", score) scores.append(score) diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json index 9e26dfeeb..8a6278fea 100644 --- a/agbenchmark/tests/regression/regression_tests.json +++ b/agbenchmark/tests/regression/regression_tests.json @@ -1 +1,14 @@ -{} \ No newline at end of file +{ + "TestWriteFile": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0-run_agent0]" + }, + "TestReadFile": { + "difficulty": "basic", + "dependencies": [ + "basic_write_file" + ], + "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_method[challenge_data0-run_agent0]" + } +} \ No newline at end of file diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt deleted file mode 100644 index 8af722f07..000000000 --- a/agbenchmark/tests/regression/regression_tests.txt +++ /dev/null @@ -1,14 +0,0 @@ -{ - "agbenchmark/tests/basic_abilities/write_file/write_file_test.py": { - "difficulty": "easy", - "dependencies": [], - "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[regression_data0-server_response0]" - }, - "agbenchmark/tests/basic_abilities/read_file/read_file_test.py": { - "difficulty": "basic", - "dependencies": [ - "test_write_file" - ], - "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[regression_data0-server_response0]" - } -} \ No newline at end of file diff --git a/agent/agbenchmark_run.py b/agent/agbenchmark_run.py deleted file mode 100644 index f509f5e66..000000000 --- a/agent/agbenchmark_run.py +++ /dev/null @@ -1,27 +0,0 @@ -import argparse -import subprocess -import os - - -def main(objective): - # get the current directory - current_dir = 
os.path.dirname(os.path.abspath(__file__)) - - # form the command - command = ( - f"python {os.path.join(current_dir, 'mini-agi', 'miniagi.py')} {objective}" - ) - - # run the command - subprocess.run(command, shell=True) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Run miniagi.py with an objective.") - parser.add_argument( - "objective", type=str, help="The objective to pass to miniagi.py" - ) - - args = parser.parse_args() - - main(args.objective) diff --git a/poetry.lock b/poetry.lock index d7939fbfe..7b2477bc6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -644,6 +644,20 @@ future-fstrings = "*" networkx = "*" pytest = ">=3" +[[package]] +name = "python-dotenv" +version = "1.0.0" +description = "Read key-value pairs from a .env file and set them as environment variables" +optional = false +python-versions = ">=3.8" +files = [ + {file = "python-dotenv-1.0.0.tar.gz", hash = "sha256:a8df96034aae6d2d50a4ebe8216326c61c3eb64836776504fcca410e5937a3ba"}, + {file = "python_dotenv-1.0.0-py3-none-any.whl", hash = "sha256:f5971a9226b701070a4bf2c38c89e5a3f0d64de8debda981d1db98583009122a"}, +] + +[package.extras] +cli = ["click (>=5.0)"] + [[package]] name = "requests" version = "2.31.0" @@ -814,4 +828,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "a03dfa9938e062bdf564b7678df9dc9277c7c8e504f14f98084c5a2d497a8f7c" +content-hash = "f8de5e973c92360108aaca1cecc2fdd505f10a9c2975b46c83ea9c24b4af3cfe" diff --git a/pyproject.toml b/pyproject.toml index 0a4f8ba73..043fe68a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" pytest-depends = "^1.0.1" +python-dotenv = "^1.0.0" [build-system] @@ -30,7 +31,7 @@ testpaths = [ markers = [ "retrieval", "regression", - "basic" + "basic", ] [tool.poetry.scripts] -- cgit v1.2.3 From 76ee994d2c7a205799bc7c07adfa70f0c93102e9 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Tue, 27 Jun 2023 19:19:14 -0400 Subject: read mes, remove port and host from config, etc --- .env.example | 1 - README.md | 158 +++++++-------------- agbenchmark/challenges/README.md | 31 ++-- agbenchmark/config.json | 4 +- agbenchmark/mocks/basic_gpt_agent.py | 20 --- agbenchmark/mocks/tests/basic_mocks.py | 12 +- agbenchmark/start_benchmark.py | 4 - .../basic_abilities/read_file/read_file_test.py | 5 +- .../basic_abilities/write_file/write_file_test.py | 1 - agbenchmark/tests/regression/regression_tests.json | 7 - 10 files changed, 75 insertions(+), 168 deletions(-) delete mode 100644 agbenchmark/mocks/basic_gpt_agent.py diff --git a/.env.example b/.env.example index 0a91118a9..7782d048e 100644 --- a/.env.example +++ b/.env.example @@ -1,4 +1,3 @@ -OPENAI_API_KEY= AGENT_NAME=mini-agi AGENT_TIMEOUT=60 MOCK_TEST=False \ No newline at end of file diff --git a/README.md b/README.md index 794279478..2c8daa0ad 100644 --- a/README.md +++ b/README.md @@ -2,131 +2,94 @@ A repo built for the purpose of benchmarking the performance of agents far and wide, regardless of how they are set up and how they work +## As a user + +1. `pip install auto-gpt-benchmarks` +2. Add boilerplate code to run and kill agent +3. `agbenchmark start` + - `--category challenge_category` to run tests in a specific category + - `--mock` to only run mock tests if they exists for each test + - `--noreg` to skip any tests that have passed in the past. When you run without this flag and a previous challenge that passed fails, it will now not be regression tests +4. 
We call the boilerplate code for your agent
+5. Show pass rate of tests, logs, and any other metrics
+
+## Contributing
+
 ##### Diagrams: https://whimsical.com/agbenchmark-5n4hXBq1ZGzBwRsK4TVY7x
 
-### To run the basic existing mock (June 21)
+### To run the existing mocks
 
 1. clone the repo `auto-gpt-benchmarks`
 2. `pip install poetry`
 3. `poetry shell`
 4. `poetry install`
-5. `agbenchmark start`
+5. `cp .env.example .env`
+6. `agbenchmark start --mock`
 
 Keep config the same and watch the logs :)
 
+### To run with mini-agi
+
+1. Navigate to `auto-gpt-benchmarks/agent/mini-agi`
+2. `pip install -r requirements.txt`
+3. `cp .env.example .env`, set `PROMPT_USER=false` and add your `OPENAI_API_KEY=`. Set `MODEL="gpt-3.5-turbo"` if you don't have access to `gpt-4` yet. Also make sure you have Python 3.10 or higher installed
+4. Make sure to follow the commands above, and remove the mock flag: `agbenchmark start`
+
 - To add requirements, use `poetry add requirement`.
 
 Feel free to create PRs to merge with `main` at will (but also feel free to ask for review) - if you can't, send a message in the R&D chat for access.
 
-If you push at any point and break things - it'll happen to everyone - fix it asap. Step 1 is to revert `main` to last working commit
+If you push at any point and break things - it'll happen to everyone - fix it asap. Step 1 is to revert `master` to the last working commit
 
 Let people know what beautiful code you write does, document everything well
 
 Share your progress :)
 
-## How this works
-
-1. `pip install auto-gpt-benchmarks`
-2. Add boilerplate code to start webserver to your agent (run loop and stop condition)
-3. `agbenchmark start --category challenge_category` remove challenge flag to run all tests. specify config of hostname, port, and workspace directory
-4. We call the server to run the agent for each test
-5. Show pass rate of tests, logs, and any other metrics
-
-### To run the basic existing mock (June 21)
-
-1. clone the repo `auto-gpt-benchmarks`
-2. `pip install poetry`
-3. `poetry shell`
-4. `poetry install`
-5. 
`agbenchmark start` - Keep config the same and watch the logs :) - -#### Bonuses - -- You can adds tests by git cloning auto-gpt-benchmarks to your repo -- Agent is abstracted from benchmark, don't need to do any extra setup other then starting the server -- Simple, easy to use -- Don't have to deal with cloud or parallelization yet - ### Pytest -to create a test: +an example of a test is below, use it as a template and change the class name, the .json name, what the test depends on and it's name, and the scoring logic ```python import pytest -from agbenchmark.challenges.define_task_types import ChallengeData -from ..CategoryChallenge import CategoryChallenge +from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os -data = ChallengeData.deserialize( - os.path.join(os.path.dirname(__file__), "r_file_data.json") -) - -class TestSomething(CategoryChallenge): - """Testing if LLM can read a file""" - - @pytest.mark.parametrize( - "run_agent", - [(data.task, data.mock_func)], - indirect=True, - ) - def test_retrieval( - self, workspace - ): - # scoring logic goes here -``` - -All challenges will inherit from parent class which has the mark - -```python -@pytest.mark.basic -class BasicChallenge(Challenge): - pass -``` - -If you want to add a custom mark to a Challenge, you must specify it before the test definition -```python -@pytest.mark.other_mark -def test_retrieval(self, workspace): -``` +class TestWriteFile(BasicChallenge): + """Testing if LLM can write to a file""" -To add a dependency to a challenge use the following + def get_file_path(self) -> str: # all tests must implement this method + return os.path.join(os.path.dirname(__file__), "w_file_data.json") -```python -# to defining what a test depends on -from pytest_dependency import depends - -def test1(self, request, workspace): - depends(request, data.dependencies) -# for defining a test as a dependency -@pytest.mark.dependency() -def test2 + @pytest.mark.depends(on=[], name="basic_write_file") + def test_method(self, workspace): + # implement scoring logic by looking at workspace ``` -Ordering of challenges needs to be used in combination with the above to make sure it executes afterwards +All challenges will inherit from parent class which has the mark and any specific methods for their category ```python -@pytest.mark.run(order=1) +@pytest.mark.basic +class BasicChallenge(Challenge): + pass ``` To create a file to test a challenge, add this to the challenge file which will create a file before running the server ```python -@pytest.fixture(scope="module", autouse=True) -def setup_module(workspace): - if data.ground.should_contain: +@pytest.fixture( + scope="module", autouse=True + ) # this is specific to setting up a file for the test, not all tests have this + def setup_module(self, workspace): Challenge.write_to_file( - workspace, data.ground.files[0], "this is how we're doing" + workspace, self.data.ground.files[0], "this is how we're doing" ) ``` -## Api - -FastAPI with REST, import requests to call in auto-gpt-benchmarks. Boilerplate code given to agent project to start server +#### The main Challenge class has all the parametrization and loading logic so that all tests can inherit from it. It lives within [this file](https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/blob/master/agbenchmark/Challenge.py) ## Workspace -Defined by the user on config +If `--mock` flag is used it is at `agbenchmark/mocks/workspace`. 
Otherwise for mini-agi it is at `C:/Users//miniagi` - it will be automatically set on config
 
 #### Dataset
 
 Manually created, existing challenges within Auto-GPT, https://osu-nlp-group.github.io/Mind2Web/
 
@@ -138,9 +101,9 @@ Manually created, existing challenges within Auto-GPT, https://osu-nlp-group.git
 |-- auto-gpt-benchmarks/ **main project directory**
 |   |-- metrics.py **combining scores, metrics, final evaluation**
 |   |-- start_benchmark.py **entry point from cli**
-|   |-- conftest.py **shared fixtures across all tests**
-|   |-- Challenge.py **easy challenge creation class?**
-|   |-- config.json **hostname, port, workspace folder**
+|   |-- conftest.py **config, workspace creation + teardown, regression test markers, parameterization**
+|   |-- Challenge.py **easy challenge creation class**
+|   |-- config.json **workspace folder**
 |   |-- challenges/ **challenges across different domains**
 |   |   |-- adaptability/
 |   |   |-- basic_abilities/
 |   |   |-- code/
 |   |   |-- memory/
 |   |   |-- retrieval/
 |   |   |-- web_navigation/
 |   |   |-- writing/
-|   |-- tests/ **challenges across different metrics**
-|   |   |-- basic_abilities/
-|   |   |-- interface/
-|   |-- workspace/ **workspace related func**
-|   |   |-- **init**.py
-|   |   |-- workspace_manager.py **creation, deletion**
+|   |-- tests/
+|   |   |-- basic_abilities/ **every LLM should pass these challenges**
+|   |   |-- regression/ **challenges that already passed**
 ```
-
-### Easy Challenge Creation
-
-tbd, but potentially shared Challenge class that challenges instantiate as challenges need different utils/metrics for eval
-
-#### Written Challenges
-
-For code, writing we can create a reference text and use metrics like METEOR, BERTScore, BARTScore
-
-#### Validators
-
-Designed to handle specific types of output (e.g., text, code, structured data)
-
-#### Logging
-
-Log different requests coming in - write file, change file, etc. Maybe a db in the future for metrics, logs, etc
-
-Later: GitHub Actions integration, OpenAPI?, good versioning and backward compatibility
diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md
index e457b85c4..9e74d19ce 100644
--- a/agbenchmark/challenges/README.md
+++ b/agbenchmark/challenges/README.md
@@ -4,7 +4,8 @@ Input:
 
-- **category** (str[]): Category of the challenge such as 'retrieval', 'comprehension', etc. _this is not currently used. for the future it may be needed_
+- **name** (str): Name of the challenge.
+- **category** (str[]): Category of the challenge such as 'basic', 'retrieval', 'comprehension', etc. _this is not currently used. for the future it may be needed_
 - **task** (str): The task that the agent needs to solve.
 - **dependencies** (str[]): The dependencies that the challenge needs to run. Needs to be the full node to the test function.
 - **ground** (dict): The ground truth.
@@ -12,7 +13,9 @@ Input:
 - **should_contain** (list): The exact strings that are required in the final answer.
 - **should_not_contain** (list): The exact strings that should not be in the final answer.
 - **files** (list): Files that are used for retrieval. Can specify file here or an extension.
-- **mock_func** (str): Function to mock the agent's response. This is used for testing purposes.
+- **mock** (dict): Mock response for testing.
+  - **mock_func** (str): Function to mock the agent's response. This is used for testing purposes.
+  - **mock_task** (str): Task to provide for the mock function.
 - **info** (dict): Additional info about the challenge.
 - **difficulty** (str): The difficulty of this query.
 - **description** (str): Description of the challenge. 
@@ -22,24 +25,26 @@ Example: ```python { + "name": "basic_write_file", "category": ["basic"], - "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", - "dependencies": [ - "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file" - ], + "task": "Print the the capital of America to a .txt file", + "dependencies": [], "ground": { - "answer": "random string: this is how we're doing", - "should_contain": ["random string: this is how we're doing"], - "files": ["file_to_check.txt"] + "answer": "Washington", + "should_contain": ["Washington"], + "should_not_contain": ["New York", "Los Angeles", "San Francisco"], + "files": [".txt"] + }, + "mock": { + "mock_func": "basic_write_file_mock", + "mock_task": "What is the capital of America?" }, - "mock_func": "basic_read_file_mock", "info": { - "description": "This reads the file quickly", "difficulty": "basic", - "side_effects": [""] + "description": "Tests the writing to file", + "side_effects": ["tests if there is in fact an LLM attached"] } } - ``` Current Output: diff --git a/agbenchmark/config.json b/agbenchmark/config.json index 9e5c1880f..3de1dd643 100644 --- a/agbenchmark/config.json +++ b/agbenchmark/config.json @@ -1,5 +1,3 @@ { - "hostname": "localhost", - "port": 8080, - "workspace": "C:/Users/silen/miniagi" + "hostname": "localhost" } diff --git a/agbenchmark/mocks/basic_gpt_agent.py b/agbenchmark/mocks/basic_gpt_agent.py deleted file mode 100644 index 6aac3d191..000000000 --- a/agbenchmark/mocks/basic_gpt_agent.py +++ /dev/null @@ -1,20 +0,0 @@ -import json -import openai - - -def basic_gpt_agent(query) -> str: - response = openai.ChatCompletion.create( - model="gpt-3.5-turbo-0613", messages=[{"role": "user", "content": query}] - ) - - answer = response["choices"][0]["message"]["content"] # type: ignore - - print("QUERY : ", query) - print("AGENT ANSWER: ", answer) - - return answer - - -if __name__ == "__main__": - # server boilerplate example here - basic_gpt_agent("") diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index 550095b72..631b30c2c 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -1,5 +1,4 @@ from agbenchmark.Challenge import Challenge -from ..basic_gpt_agent import basic_gpt_agent def basic_read_file_mock(task: str, workspace: str): @@ -18,9 +17,8 @@ def basic_write_file_mock(task: str, workspace: str): """ This mock writes to a file (creates one if it doesn't exist) """ - - # Call the basic_gpt_agent to get a response. - response = basic_gpt_agent(task) - - # Open the file in write mode. 
- Challenge.write_to_file(workspace, "file_to_check.txt", response) + Challenge.write_to_file( + workspace, + "file_to_check.txt", + "Washington DC is the capital of the United States of America", + ) diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index ac612293a..c9f3643cc 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -27,10 +27,6 @@ def start(category, noreg, mock): if not os.path.exists(config_dir) or os.stat(config_dir).st_size == 0: config = {} - config["hostname"] = click.prompt( - "\nPlease enter a new hostname", default="localhost" - ) - config["port"] = click.prompt("Please enter a new port", default=8080) config["workspace"] = click.prompt( "Please enter a new workspace path", default=os.path.join(Path.home(), "miniagi"), diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 68288a42c..f99ae608c 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -1,5 +1,4 @@ import pytest -from agbenchmark.challenges.define_task_types import ChallengeData from agbenchmark.Challenge import Challenge from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os @@ -8,9 +7,7 @@ import os class TestReadFile(BasicChallenge): """Testing if LLM can read a file""" - @pytest.fixture( - scope="module", autouse=True - ) # this is specific to setting up a file for the test, not all tests have this + @pytest.fixture(scope="module", autouse=True) def setup_module(self, workspace): Challenge.write_to_file( workspace, self.data.ground.files[0], "this is how we're doing" diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 8caa6605a..39c73b163 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -1,5 +1,4 @@ import pytest -from agbenchmark.challenges.define_task_types import ChallengeData from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json index 8a6278fea..384f9e7c6 100644 --- a/agbenchmark/tests/regression/regression_tests.json +++ b/agbenchmark/tests/regression/regression_tests.json @@ -3,12 +3,5 @@ "difficulty": "basic", "dependencies": [], "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0-run_agent0]" - }, - "TestReadFile": { - "difficulty": "basic", - "dependencies": [ - "basic_write_file" - ], - "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_method[challenge_data0-run_agent0]" } } \ No newline at end of file -- cgit v1.2.3 From 0c81585a538facff2b62c22d5b896df00cea9c17 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Tue, 27 Jun 2023 22:17:42 -0400 Subject: Update README.md (#41) --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 2c8daa0ad..504132ddb 100644 --- a/README.md +++ b/README.md @@ -73,8 +73,7 @@ class BasicChallenge(Challenge): pass ``` -To create a file to test a challenge, add this to the challenge file which will create a file before running the server - +Add the below to create a file in the workspace prior to 
running a challenge. Only use when a file is needed to be created in the workspace prior to a test, such as with the read_file_test. ```python @pytest.fixture( scope="module", autouse=True -- cgit v1.2.3 From ac5af736963dac95969f0cb3d0f99480a0a4f401 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Wed, 28 Jun 2023 21:28:46 -0400 Subject: trying to get kill process --- agbenchmark/config.json | 8 +- agbenchmark/conftest.py | 70 ++++++++++++---- agbenchmark/tests/regression/regression_tests.json | 7 ++ poetry.lock | 93 +++++++++++++++++++++- pyproject.toml | 2 + 5 files changed, 161 insertions(+), 19 deletions(-) diff --git a/agbenchmark/config.json b/agbenchmark/config.json index 3de1dd643..d95b8e443 100644 --- a/agbenchmark/config.json +++ b/agbenchmark/config.json @@ -1,3 +1,9 @@ { - "hostname": "localhost" + "workspace": "C:\\Users\\silen\\miniagi", + "cutoff": { + "type": "time", + "user_prompt": "Press enter to continue or abort this action by typing feedback:", + "user_input": "\n", + "count": 5 + } } diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 4edd4b5e0..2590ce781 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -2,11 +2,10 @@ import json import os import pytest import shutil +import subprocess +import sys from agbenchmark.tests.regression.RegressionManager import RegressionManager -import requests from agbenchmark.mocks.MockManager import MockManager -import subprocess -from agbenchmark.Challenge import Challenge from dotenv import load_dotenv load_dotenv() @@ -44,8 +43,16 @@ def pytest_addoption(parser): parser.addoption("--mock", action="store_true", default=False) +def check_cycle_count(cycle_count: int, cutoff: int, proc): + """Increment, print, and check cycle count.""" + cycle_count += 1 + print(f"Cycle count: {cycle_count}") + if cycle_count >= cutoff: + proc.terminate(force=True) + return cycle_count + + AGENT_NAME = os.getenv("AGENT_NAME") -AGENT_TIMEOUT = os.getenv("AGENT_TIMEOUT") @pytest.fixture(autouse=True) @@ -70,19 +77,48 @@ def run_agent(request, config): else: path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}") - try: - timeout = int(AGENT_TIMEOUT) if AGENT_TIMEOUT is not None else 60 - - subprocess.run( - ["python", "miniagi.py", task], - check=True, - cwd=path, - timeout=timeout - # text=True, - # capture_output=True - ) - except subprocess.TimeoutExpired: - print("The subprocess has exceeded the time limit and was terminated.") + timeout = sys.maxsize + + if config["cutoff"]["type"] == "time": + timeout = config["cutoff"]["count"] or 60 + + from pexpect.popen_spawn import PopenSpawn + + print(f"Running {task} with timeout {timeout}") + + # Starting the subprocess using pexpect + proc = PopenSpawn("python", ["miniagi.py", task], timeout=timeout, cwd=path) + + print("proc", proc) + + cycle_count = 0 + + while True: + try: + # If we get the prompt for user input, we send "\n" + if config["cutoff"]["type"] == "user_input": + proc.expect([config["cutoff"]["user_prompt"]]) + proc.sendline(config["cutoff"]["user_input"]) + cycle_count = check_cycle_count( + cycle_count, config["cutoff"]["count"], proc + ) + elif config["cutoff"]["type"] == "cycle_count": + match = proc.expect([r"Cycle count: (\d+)"]) + if match is not None: + cycle_count = int(match.group(1)) # type: ignore + cycle_count = check_cycle_count( + cycle_count, config["cutoff"]["count"], proc + ) + + # for cutoff type "time", just let it run until timeout + except expect.TIMEOUT: + print("The subprocess has exceeded the time limit and was terminated.") + 
break + except expect.EOF: + print("The subprocess has finished running.") + break + + proc.close() regression_json = "agbenchmark/tests/regression/regression_tests.json" diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json index 384f9e7c6..8a6278fea 100644 --- a/agbenchmark/tests/regression/regression_tests.json +++ b/agbenchmark/tests/regression/regression_tests.json @@ -3,5 +3,12 @@ "difficulty": "basic", "dependencies": [], "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0-run_agent0]" + }, + "TestReadFile": { + "difficulty": "basic", + "dependencies": [ + "basic_write_file" + ], + "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_method[challenge_data0-run_agent0]" } } \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index 7b2477bc6..a460f988d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -538,6 +538,20 @@ files = [ {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, ] +[[package]] +name = "pexpect" +version = "4.8.0" +description = "Pexpect allows easy control of interactive console applications." +optional = false +python-versions = "*" +files = [ + {file = "pexpect-4.8.0-py2.py3-none-any.whl", hash = "sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937"}, + {file = "pexpect-4.8.0.tar.gz", hash = "sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c"}, +] + +[package.dependencies] +ptyprocess = ">=0.5" + [[package]] name = "pluggy" version = "1.0.0" @@ -553,6 +567,43 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "psutil" +version = "5.9.5" +description = "Cross-platform lib for process and system monitoring in Python." 
+optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "psutil-5.9.5-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:be8929ce4313f9f8146caad4272f6abb8bf99fc6cf59344a3167ecd74f4f203f"}, + {file = "psutil-5.9.5-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:ab8ed1a1d77c95453db1ae00a3f9c50227ebd955437bcf2a574ba8adbf6a74d5"}, + {file = "psutil-5.9.5-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:4aef137f3345082a3d3232187aeb4ac4ef959ba3d7c10c33dd73763fbc063da4"}, + {file = "psutil-5.9.5-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:ea8518d152174e1249c4f2a1c89e3e6065941df2fa13a1ab45327716a23c2b48"}, + {file = "psutil-5.9.5-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:acf2aef9391710afded549ff602b5887d7a2349831ae4c26be7c807c0a39fac4"}, + {file = "psutil-5.9.5-cp27-none-win32.whl", hash = "sha256:5b9b8cb93f507e8dbaf22af6a2fd0ccbe8244bf30b1baad6b3954e935157ae3f"}, + {file = "psutil-5.9.5-cp27-none-win_amd64.whl", hash = "sha256:8c5f7c5a052d1d567db4ddd231a9d27a74e8e4a9c3f44b1032762bd7b9fdcd42"}, + {file = "psutil-5.9.5-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:3c6f686f4225553615612f6d9bc21f1c0e305f75d7d8454f9b46e901778e7217"}, + {file = "psutil-5.9.5-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7a7dd9997128a0d928ed4fb2c2d57e5102bb6089027939f3b722f3a210f9a8da"}, + {file = "psutil-5.9.5-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89518112647f1276b03ca97b65cc7f64ca587b1eb0278383017c2a0dcc26cbe4"}, + {file = "psutil-5.9.5-cp36-abi3-win32.whl", hash = "sha256:104a5cc0e31baa2bcf67900be36acde157756b9c44017b86b2c049f11957887d"}, + {file = "psutil-5.9.5-cp36-abi3-win_amd64.whl", hash = "sha256:b258c0c1c9d145a1d5ceffab1134441c4c5113b2417fafff7315a917a026c3c9"}, + {file = "psutil-5.9.5-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:c607bb3b57dc779d55e1554846352b4e358c10fff3abf3514a7a6601beebdb30"}, + {file = "psutil-5.9.5.tar.gz", hash = "sha256:5410638e4df39c54d957fc51ce03048acd8e6d60abc0f5107af51e5fb566eb3c"}, +] + +[package.extras] +test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] + +[[package]] +name = "ptyprocess" +version = "0.7.0" +description = "Run a subprocess in a pseudo terminal" +optional = false +python-versions = "*" +files = [ + {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, + {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"}, +] + [[package]] name = "pydantic" version = "1.10.9" @@ -658,6 +709,29 @@ files = [ [package.extras] cli = ["click (>=5.0)"] +[[package]] +name = "pywin32" +version = "306" +description = "Python for Window Extensions" +optional = false +python-versions = "*" +files = [ + {file = "pywin32-306-cp310-cp310-win32.whl", hash = "sha256:06d3420a5155ba65f0b72f2699b5bacf3109f36acbe8923765c22938a69dfc8d"}, + {file = "pywin32-306-cp310-cp310-win_amd64.whl", hash = "sha256:84f4471dbca1887ea3803d8848a1616429ac94a4a8d05f4bc9c5dcfd42ca99c8"}, + {file = "pywin32-306-cp311-cp311-win32.whl", hash = "sha256:e65028133d15b64d2ed8f06dd9fbc268352478d4f9289e69c190ecd6818b6407"}, + {file = "pywin32-306-cp311-cp311-win_amd64.whl", hash = "sha256:a7639f51c184c0272e93f244eb24dafca9b1855707d94c192d4a0b4c01e1100e"}, + {file = "pywin32-306-cp311-cp311-win_arm64.whl", hash = 
"sha256:70dba0c913d19f942a2db25217d9a1b726c278f483a919f1abfed79c9cf64d3a"}, + {file = "pywin32-306-cp312-cp312-win32.whl", hash = "sha256:383229d515657f4e3ed1343da8be101000562bf514591ff383ae940cad65458b"}, + {file = "pywin32-306-cp312-cp312-win_amd64.whl", hash = "sha256:37257794c1ad39ee9be652da0462dc2e394c8159dfd913a8a4e8eb6fd346da0e"}, + {file = "pywin32-306-cp312-cp312-win_arm64.whl", hash = "sha256:5821ec52f6d321aa59e2db7e0a35b997de60c201943557d108af9d4ae1ec7040"}, + {file = "pywin32-306-cp37-cp37m-win32.whl", hash = "sha256:1c73ea9a0d2283d889001998059f5eaaba3b6238f767c9cf2833b13e6a685f65"}, + {file = "pywin32-306-cp37-cp37m-win_amd64.whl", hash = "sha256:72c5f621542d7bdd4fdb716227be0dd3f8565c11b280be6315b06ace35487d36"}, + {file = "pywin32-306-cp38-cp38-win32.whl", hash = "sha256:e4c092e2589b5cf0d365849e73e02c391c1349958c5ac3e9d5ccb9a28e017b3a"}, + {file = "pywin32-306-cp38-cp38-win_amd64.whl", hash = "sha256:e8ac1ae3601bee6ca9f7cb4b5363bf1c0badb935ef243c4733ff9a393b1690c0"}, + {file = "pywin32-306-cp39-cp39-win32.whl", hash = "sha256:e25fd5b485b55ac9c057f67d94bc203f3f6595078d1fb3b458c9c28b7153a802"}, + {file = "pywin32-306-cp39-cp39-win_amd64.whl", hash = "sha256:39b61c15272833b5c329a2989999dcae836b1eed650252ab1b7bfbe1d59f30f4"}, +] + [[package]] name = "requests" version = "2.31.0" @@ -738,6 +812,23 @@ secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17. socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] +[[package]] +name = "wexpect" +version = "4.0.0" +description = "Windows alternative of pexpect" +optional = false +python-versions = "*" +files = [ + {file = "wexpect-4.0.0.tar.gz", hash = "sha256:de9e739e78ec4d74a39bf8499904dacb6c594007a674fb7e10752c9b131f6522"}, +] + +[package.dependencies] +psutil = ">=5.0.0" +pywin32 = ">=220" + +[package.extras] +test = ["codecov", "coverage", "pyinstaller", "setuptools (>=38.0)", "tox", "twine"] + [[package]] name = "yarl" version = "1.9.2" @@ -828,4 +919,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "f8de5e973c92360108aaca1cecc2fdd505f10a9c2975b46c83ea9c24b4af3cfe" +content-hash = "8ab722acade739b9fb841ecae3b8cabd4f1d8a355864573a93d9faa11dcffb90" diff --git a/pyproject.toml b/pyproject.toml index 043fe68a2..af9688d14 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,8 @@ openai = "^0.27.8" pydantic = "^1.10.9" pytest-depends = "^1.0.1" python-dotenv = "^1.0.0" +pexpect = "^4.8.0" +wexpect = "^4.0.0" [build-system] -- cgit v1.2.3 From fce421fb335107cddd9fd60b32e91902be7b5eae Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Thu, 29 Jun 2023 20:51:23 -0400 Subject: moving logic to benchmark.py file --- agbenchmark/benchmark.py | 65 ++++++++++++++++++++++++++++++++++++++++++++++++ agbenchmark/conftest.py | 61 ++------------------------------------------- 2 files changed, 67 insertions(+), 59 deletions(-) create mode 100644 agbenchmark/benchmark.py diff --git a/agbenchmark/benchmark.py b/agbenchmark/benchmark.py new file mode 100644 index 000000000..6dc3b2312 --- /dev/null +++ b/agbenchmark/benchmark.py @@ -0,0 +1,65 @@ +import os +import sys +import pexpect as expect +from dotenv import load_dotenv + +load_dotenv() + + +def check_cycle_count(cycle_count: int, cutoff: int, proc): + """Increment, print, and check cycle count.""" + cycle_count += 1 + print(f"Cycle count: {cycle_count}") + if cycle_count >= cutoff: + proc.terminate(force=True) + return cycle_count + + +AGENT_NAME = os.getenv("AGENT_NAME") + + +def 
run_agnostic(config, task): + path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}") + + timeout = sys.maxsize + + if config["cutoff"]["type"] == "time": + timeout = config["cutoff"]["count"] or 60 + + # from pexpect.popen_spawn import PopenSpawn + + print(f"Running {task} with timeout {timeout}") + + # Starting the subprocess using pexpect + proc = expect.spawn("python", ["miniagi.py", task], timeout=timeout, cwd=path) + + print("proc", proc) + + cycle_count = 0 + + while True: + try: + # If we get the prompt for user input, we send "\n" + if config["cutoff"]["type"] == "user_input": + proc.expect([config["cutoff"]["user_prompt"]]) + proc.sendline(config["cutoff"]["user_input"]) + cycle_count = check_cycle_count( + cycle_count, config["cutoff"]["count"], proc + ) + elif config["cutoff"]["type"] == "cycle_count": + match = proc.expect([r"Cycle count: (\d+)"]) + if match is not None: + cycle_count = int(match.group(1)) # type: ignore + cycle_count = check_cycle_count( + cycle_count, config["cutoff"]["count"], proc + ) + + # for cutoff type "time", just let it run until timeout + except expect.TIMEOUT: + print("The subprocess has exceeded the time limit and was terminated.") + break + except expect.EOF: + print("The subprocess has finished running.") + break + + proc.close() diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 2590ce781..25510e42b 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -6,9 +6,7 @@ import subprocess import sys from agbenchmark.tests.regression.RegressionManager import RegressionManager from agbenchmark.mocks.MockManager import MockManager -from dotenv import load_dotenv - -load_dotenv() +from agbenchmark.benchmark import run_agnostic @pytest.fixture(scope="module") @@ -43,18 +41,6 @@ def pytest_addoption(parser): parser.addoption("--mock", action="store_true", default=False) -def check_cycle_count(cycle_count: int, cutoff: int, proc): - """Increment, print, and check cycle count.""" - cycle_count += 1 - print(f"Cycle count: {cycle_count}") - if cycle_count >= cutoff: - proc.terminate(force=True) - return cycle_count - - -AGENT_NAME = os.getenv("AGENT_NAME") - - @pytest.fixture(autouse=True) def run_agent(request, config): """Calling to get a response""" @@ -75,50 +61,7 @@ def run_agent(request, config): else: print("No mock provided") else: - path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}") - - timeout = sys.maxsize - - if config["cutoff"]["type"] == "time": - timeout = config["cutoff"]["count"] or 60 - - from pexpect.popen_spawn import PopenSpawn - - print(f"Running {task} with timeout {timeout}") - - # Starting the subprocess using pexpect - proc = PopenSpawn("python", ["miniagi.py", task], timeout=timeout, cwd=path) - - print("proc", proc) - - cycle_count = 0 - - while True: - try: - # If we get the prompt for user input, we send "\n" - if config["cutoff"]["type"] == "user_input": - proc.expect([config["cutoff"]["user_prompt"]]) - proc.sendline(config["cutoff"]["user_input"]) - cycle_count = check_cycle_count( - cycle_count, config["cutoff"]["count"], proc - ) - elif config["cutoff"]["type"] == "cycle_count": - match = proc.expect([r"Cycle count: (\d+)"]) - if match is not None: - cycle_count = int(match.group(1)) # type: ignore - cycle_count = check_cycle_count( - cycle_count, config["cutoff"]["count"], proc - ) - - # for cutoff type "time", just let it run until timeout - except expect.TIMEOUT: - print("The subprocess has exceeded the time limit and was terminated.") - break - except expect.EOF: - print("The 
subprocess has finished running.") - break - - proc.close() + run_agnostic(config, task) regression_json = "agbenchmark/tests/regression/regression_tests.json" -- cgit v1.2.3 From 2987d71264c7ffb0b6184e28e17c503aef5b4681 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Fri, 30 Jun 2023 10:50:54 -0400 Subject: moving run agent to tests & agnostic run working --- .env.example | 2 +- agbenchmark/Challenge.py | 16 +-- agbenchmark/agent_interface.py | 108 +++++++++++++++++++++ agbenchmark/benchmark.py | 65 ------------- agbenchmark/challenges/retrieval/r1/r1_test.py | 7 +- agbenchmark/config.json | 9 +- agbenchmark/conftest.py | 37 ------- agbenchmark/mocks/workspace/file_to_check.txt | 1 + .../basic_abilities/read_file/read_file_test.py | 7 +- .../basic_abilities/write_file/write_file_test.py | 6 +- agbenchmark/tests/regression/regression_tests.json | 9 +- agent/hook.py | 10 ++ pyproject.toml | 2 - 13 files changed, 144 insertions(+), 135 deletions(-) create mode 100644 agbenchmark/agent_interface.py delete mode 100644 agbenchmark/benchmark.py create mode 100644 agbenchmark/mocks/workspace/file_to_check.txt create mode 100644 agent/hook.py diff --git a/.env.example b/.env.example index 7782d048e..e50ed58a5 100644 --- a/.env.example +++ b/.env.example @@ -1,3 +1,3 @@ AGENT_NAME=mini-agi -AGENT_TIMEOUT=60 +ENVIRONMENT=local MOCK_TEST=False \ No newline at end of file diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py index f644abc4a..7b1e4df04 100644 --- a/agbenchmark/Challenge.py +++ b/agbenchmark/Challenge.py @@ -4,7 +4,7 @@ import pytest from abc import ABC, abstractmethod from agbenchmark.challenges.define_task_types import Ground from agbenchmark.challenges.define_task_types import ChallengeData -from dotenv import load_dotenv, set_key +from dotenv import load_dotenv load_dotenv() @@ -40,22 +40,24 @@ class Challenge(ABC): print("self.data.dependencies", self.data.dependencies) return self.data.dependencies + def setup_challenge(self, config): + from agbenchmark.agent_interface import run_agent + + print("SETTING UP CHALLENGE...") + + run_agent(self.task, self.mock, config) + @property def name(self) -> str: print("self.data.name", self.data.name) return self.data.name - @pytest.mark.parametrize( - "run_agent", - [(task, mock)], - indirect=True, - ) @pytest.mark.parametrize( "challenge_data", [data], indirect=True, ) - def test_method(self, workspace): + def test_method(self, config): raise NotImplementedError @staticmethod diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py new file mode 100644 index 000000000..eba26fc18 --- /dev/null +++ b/agbenchmark/agent_interface.py @@ -0,0 +1,108 @@ +import os +import sys +import subprocess +import time +from agbenchmark.mocks.MockManager import MockManager +from multiprocessing import Process, Pipe + +from agent.hook import run_specific_agent + +from dotenv import load_dotenv + +load_dotenv() + +MOCK_FLAG = os.getenv("MOCK_TEST") + + +def run_agent(task, mock_func, config): + """Calling to get a response""" + + if mock_func == None and MOCK_FLAG == "True": + print("No mock provided") + elif MOCK_FLAG == "True": + mock_manager = MockManager( + task + ) # workspace doesn't need to be passed in, stays the same + print("Server unavailable, using mock", mock_func) + mock_manager.delegate(mock_func) + else: + if config["agent"]["type"] == "python": + run_agent_function(config, task) + elif config["agent"]["type"] == "script": + run_agent_command(config, task) + + +ENVIRONMENT = os.getenv("ENVIRONMENT") or 
"production" + + +def run_agent_command(config, task): + path = config["agent"]["path"] + + if ENVIRONMENT == "local": + AGENT_NAME = os.getenv("AGENT_NAME") + path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}") + + timeout = config["agent"]["cutoff"] or sys.maxsize + print(f"Running {task} with timeout {timeout}") + + command_from_config = config["agent"]["script"] + command_list = command_from_config.split() + + # replace '{}' with the task + command_list = [cmd if cmd != "{}" else task for cmd in command_list] + print("path, command_list", path, command_list) + start_time = time.time() + proc = subprocess.Popen( + command_list, + cwd=path, + shell=True, + ) + + while True: + if time.time() - start_time > timeout: + print("The subprocess has exceeded the time limit and was terminated.") + proc.terminate() + break + + if proc.poll() is not None: + print("The subprocess has finished running.") + break + + +def run_agent_function(config, task): + timeout = ( + config["cutoff"]["count"] if config["cutoff"]["type"] == "time" else sys.maxsize + ) + print( + f"Running Python function '{config['agent']['function']}' with timeout {timeout}" + ) + + parent_conn, child_conn = Pipe() + process = Process(target=run_specific_agent, args=(task, child_conn)) + process.start() + start_time = time.time() + + while True: + if parent_conn.poll(): # Check if there's a new message from the child process + response, cycle_count = parent_conn.recv() + print(f"Cycle {cycle_count}: {response}") + + if cycle_count >= config["cutoff"]["count"]: + print( + f"Cycle count has reached the limit of {config['cutoff']['count']}. Terminating." + ) + child_conn.send("terminate") + break + + if time.time() - start_time > timeout: + print("The Python function has exceeded the time limit and was terminated.") + child_conn.send( + "terminate" + ) # Send a termination signal to the child process + break + + if not process.is_alive(): + print("The Python function has finished running.") + break + + process.join() diff --git a/agbenchmark/benchmark.py b/agbenchmark/benchmark.py deleted file mode 100644 index 6dc3b2312..000000000 --- a/agbenchmark/benchmark.py +++ /dev/null @@ -1,65 +0,0 @@ -import os -import sys -import pexpect as expect -from dotenv import load_dotenv - -load_dotenv() - - -def check_cycle_count(cycle_count: int, cutoff: int, proc): - """Increment, print, and check cycle count.""" - cycle_count += 1 - print(f"Cycle count: {cycle_count}") - if cycle_count >= cutoff: - proc.terminate(force=True) - return cycle_count - - -AGENT_NAME = os.getenv("AGENT_NAME") - - -def run_agnostic(config, task): - path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}") - - timeout = sys.maxsize - - if config["cutoff"]["type"] == "time": - timeout = config["cutoff"]["count"] or 60 - - # from pexpect.popen_spawn import PopenSpawn - - print(f"Running {task} with timeout {timeout}") - - # Starting the subprocess using pexpect - proc = expect.spawn("python", ["miniagi.py", task], timeout=timeout, cwd=path) - - print("proc", proc) - - cycle_count = 0 - - while True: - try: - # If we get the prompt for user input, we send "\n" - if config["cutoff"]["type"] == "user_input": - proc.expect([config["cutoff"]["user_prompt"]]) - proc.sendline(config["cutoff"]["user_input"]) - cycle_count = check_cycle_count( - cycle_count, config["cutoff"]["count"], proc - ) - elif config["cutoff"]["type"] == "cycle_count": - match = proc.expect([r"Cycle count: (\d+)"]) - if match is not None: - cycle_count = int(match.group(1)) # type: ignore - cycle_count 
= check_cycle_count( - cycle_count, config["cutoff"]["count"], proc - ) - - # for cutoff type "time", just let it run until timeout - except expect.TIMEOUT: - print("The subprocess has exceeded the time limit and was terminated.") - break - except expect.EOF: - print("The subprocess has finished running.") - break - - proc.close() diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 0bd907d8a..b679a731d 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -1,6 +1,4 @@ -import pytest from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge -from agbenchmark.challenges.define_task_types import ChallengeData, Ground import os @@ -10,8 +8,9 @@ class TestRetrieval1(RetrievalChallenge): def get_file_path(self) -> str: # all tests must implement this method return os.path.join(os.path.dirname(__file__), "r1_data.json") - def test_method(self, workspace): - files_contents = self.open_files(workspace, self.data.ground.files) + def test_method(self, config): + self.setup_challenge(config) + files_contents = self.open_files(config["workspace"], self.data.ground.files) scores = [] for file_content in files_contents: diff --git a/agbenchmark/config.json b/agbenchmark/config.json index d95b8e443..7388085dc 100644 --- a/agbenchmark/config.json +++ b/agbenchmark/config.json @@ -1,9 +1,10 @@ { "workspace": "C:\\Users\\silen\\miniagi", - "cutoff": { - "type": "time", - "user_prompt": "Press enter to continue or abort this action by typing feedback:", + "agent": { + "type": "script", + "path": "", + "script": "python miniagi.py {}", "user_input": "\n", - "count": 5 + "cutoff": 60 } } diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 25510e42b..0f1fc7bb2 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -2,11 +2,7 @@ import json import os import pytest import shutil -import subprocess -import sys from agbenchmark.tests.regression.RegressionManager import RegressionManager -from agbenchmark.mocks.MockManager import MockManager -from agbenchmark.benchmark import run_agnostic @pytest.fixture(scope="module") @@ -41,29 +37,6 @@ def pytest_addoption(parser): parser.addoption("--mock", action="store_true", default=False) -@pytest.fixture(autouse=True) -def run_agent(request, config): - """Calling to get a response""" - if isinstance(request.param, tuple): - task = request.param[0] # The task is passed in indirectly - mock_function_name = request.param[1] or None - else: - task = request.param - mock_function_name = None - - if mock_function_name != None and (request.config.getoption("--mock")): - if mock_function_name: - mock_manager = MockManager( - task - ) # workspace doesn't need to be passed in, stays the same - print("Server unavailable, using mock", mock_function_name) - mock_manager.delegate(mock_function_name) - else: - print("No mock provided") - else: - run_agnostic(config, task) - - regression_json = "agbenchmark/tests/regression/regression_tests.json" regression_manager = RegressionManager(regression_json) @@ -120,13 +93,3 @@ def pytest_generate_tests(metafunc): # Add the parameters to the test function metafunc.parametrize("challenge_data", [params], indirect=True) - - if "run_agent" in metafunc.fixturenames: - # Get the instance of the test class - test_class = metafunc.cls() - - # Generate the parameters - params = [(test_class.task, test_class.mock)] - - # Add the parameters to the test function - 
metafunc.parametrize("run_agent", params, indirect=True) diff --git a/agbenchmark/mocks/workspace/file_to_check.txt b/agbenchmark/mocks/workspace/file_to_check.txt new file mode 100644 index 000000000..48dc8cff1 --- /dev/null +++ b/agbenchmark/mocks/workspace/file_to_check.txt @@ -0,0 +1 @@ +Washington DC is the capital of the United States of America \ No newline at end of file diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index f99ae608c..c0aaa7f93 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -17,10 +17,9 @@ class TestReadFile(BasicChallenge): return os.path.join(os.path.dirname(__file__), "r_file_data.json") @pytest.mark.depends(on=["basic_write_file"], name="basic_read_file") - def test_method( - self, workspace - ): # run_test is a common name that all tests must implement - files_contents = self.open_files(workspace, self.data.ground.files) + def test_method(self, config): + self.setup_challenge(config) + files_contents = self.open_files(config["workspace"], self.data.ground.files) scores = [] for file_content in files_contents: diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 39c73b163..306375ddd 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -10,9 +10,9 @@ class TestWriteFile(BasicChallenge): return os.path.join(os.path.dirname(__file__), "w_file_data.json") @pytest.mark.depends(on=[], name="basic_write_file") - def test_method(self, workspace): - print("my workspace is ", workspace) - files_contents = self.open_files(workspace, self.data.ground.files) + def test_method(self, config): + self.setup_challenge(config) + files_contents = self.open_files(config["workspace"], self.data.ground.files) scores = [] for file_content in files_contents: diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json index 8a6278fea..d13b763c7 100644 --- a/agbenchmark/tests/regression/regression_tests.json +++ b/agbenchmark/tests/regression/regression_tests.json @@ -2,13 +2,6 @@ "TestWriteFile": { "difficulty": "basic", "dependencies": [], - "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0-run_agent0]" - }, - "TestReadFile": { - "difficulty": "basic", - "dependencies": [ - "basic_write_file" - ], - "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_method[challenge_data0-run_agent0]" + "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0]" } } \ No newline at end of file diff --git a/agent/hook.py b/agent/hook.py new file mode 100644 index 000000000..6fa534180 --- /dev/null +++ b/agent/hook.py @@ -0,0 +1,10 @@ +async def run_specific_agent(task, conn): + while ( + not conn.poll() + ): # Check if there's a termination signal from the main process + response, cycle_count = await run_agent( + task + ) # run the agent and get the response and cycle count + + # Send response and cycle count back to the main process + conn.send((response, cycle_count)) diff --git a/pyproject.toml b/pyproject.toml index af9688d14..043fe68a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,8 +16,6 @@ 
openai = "^0.27.8"
 pydantic = "^1.10.9"
 pytest-depends = "^1.0.1"
 python-dotenv = "^1.0.0"
-pexpect = "^4.8.0"
-wexpect = "^4.0.0"

 [build-system]
-- cgit v1.2.3

From 7c352b745ec90486826289ed735800197e95cd80 Mon Sep 17 00:00:00 2001
From: Silen Naihin
Date: Fri, 30 Jun 2023 11:55:43 -0400
Subject: integrate config, agent_interface just func, hook

---
 agbenchmark/Challenge.py                           |   5 +-
 agbenchmark/agent_interface.py                     | 118 +++++++--------------
 agbenchmark/config.json                            |   9 +-
 agbenchmark/start_benchmark.py                     |  12 ++-
 agbenchmark/tests/regression/regression_tests.json |   8 +-
 agent/benchmarks.py                                |  15 +++
 agent/hook.py                                      |  10 --
 7 files changed, 70 insertions(+), 107 deletions(-)
 create mode 100644 agent/benchmarks.py
 delete mode 100644 agent/hook.py

diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py
index 7b1e4df04..d7a2bdc9b 100644
--- a/agbenchmark/Challenge.py
+++ b/agbenchmark/Challenge.py
@@ -23,6 +23,7 @@ class Challenge(ABC):
 
     @property
     def data(self) -> ChallengeData:
+        # TODO: make it so that this is cached somewhere to just call self.deserialized_data
         return ChallengeData.deserialize(self.get_file_path())
 
     @property
@@ -37,19 +38,15 @@ class Challenge(ABC):
 
     @property
     def dependencies(self) -> list:
-        print("self.data.dependencies", self.data.dependencies)
         return self.data.dependencies
 
     def setup_challenge(self, config):
         from agbenchmark.agent_interface import run_agent
 
-        print("SETTING UP CHALLENGE...")
-
         run_agent(self.task, self.mock, config)
 
     @property
     def name(self) -> str:
-        print("self.data.name", self.data.name)
         return self.data.name
 
     @pytest.mark.parametrize(
diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py
index eba26fc18..2ff2acf30 100644
--- a/agbenchmark/agent_interface.py
+++ b/agbenchmark/agent_interface.py
@@ -1,12 +1,9 @@
 import os
-import sys
-import subprocess
+import importlib
 import time
 from agbenchmark.mocks.MockManager import MockManager
 from multiprocessing import Process, Pipe
 
-from agent.hook import run_specific_agent
-
 from dotenv import load_dotenv
 
 load_dotenv()
@@ -26,83 +23,48 @@ def run_agent(task, mock_func, config):
         print("Server unavailable, using mock", mock_func)
         mock_manager.delegate(mock_func)
     else:
-        if config["agent"]["type"] == "python":
-            run_agent_function(config, task)
-        elif config["agent"]["type"] == "script":
-            run_agent_command(config, task)
-
-
-ENVIRONMENT = os.getenv("ENVIRONMENT") or "production"
-
-
-def run_agent_command(config, task):
-    path = config["agent"]["path"]
-
-    if ENVIRONMENT == "local":
-        AGENT_NAME = os.getenv("AGENT_NAME")
-        path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}")
-
-    timeout = config["agent"]["cutoff"] or sys.maxsize
-    print(f"Running {task} with timeout {timeout}")
-
-    command_from_config = config["agent"]["script"]
-    command_list = command_from_config.split()
-
-    # replace '{}' with the task
-    command_list = [cmd if cmd != "{}" else task for cmd in command_list]
-    print("path, command_list", path, command_list)
-    start_time = time.time()
-    proc = subprocess.Popen(
-        command_list,
-        cwd=path,
-        shell=True,
-    )
-
-    while True:
-        if time.time() - start_time > timeout:
-            print("The subprocess has exceeded the time limit and was terminated.")
-            proc.terminate()
-            break
-
-        if proc.poll() is not None:
-            print("The subprocess has finished running.")
-            break
-
-
-def run_agent_function(config, task):
-    timeout = (
-        config["cutoff"]["count"] if config["cutoff"]["type"] == "time" else sys.maxsize
-    )
-    print(
-        f"Running Python function '{config['agent']['function']}' with timeout {timeout}"
-    )
-
-    parent_conn, child_conn = Pipe()
-    process = Process(target=run_specific_agent, args=(task, child_conn))
-    process.start()
-    start_time = time.time()
-
-    while True:
-        if parent_conn.poll():  # Check if there's a new message from the child process
-            response, cycle_count = parent_conn.recv()
-            print(f"Cycle {cycle_count}: {response}")
-
-            if cycle_count >= config["cutoff"]["count"]:
+        timeout = config["cutoff"]
+        print(f"Running Python function '{config['func_path']}' with timeout {timeout}")
+
+        parent_conn, child_conn = Pipe()
+
+        # Import the specific agent dynamically
+        module_name = config["func_path"].replace("/", ".").rstrip(".py")
+        module = importlib.import_module(module_name)
+        run_specific_agent = getattr(module, "run_specific_agent")
+
+        process = Process(target=run_specific_agent, args=(task, child_conn))
+        process.start()
+        start_time = time.time()
+
+        while True:
+            if (
+                parent_conn.poll()
+            ):  # Check if there's a new message from the child process
+                response, cycle_count = parent_conn.recv()
+                print(f"Cycle {cycle_count}: {response}")
+
+                if cycle_count >= config["cutoff"]:
+                    print(
+                        f"Cycle count has reached the limit of {config['cutoff']}. Terminating."
+                    )
+                    child_conn.send("terminate")
+                    break
+
+            if time.time() - start_time > timeout:
                 print(
-                    f"Cycle count has reached the limit of {config['cutoff']['count']}. Terminating."
+                    "The Python function has exceeded the time limit and was terminated."
                 )
-                child_conn.send("terminate")
+                child_conn.send(
+                    "terminate"
+                )  # Send a termination signal to the child process
                 break
 
-            if time.time() - start_time > timeout:
-                print("The Python function has exceeded the time limit and was terminated.")
-                child_conn.send(
-                    "terminate"
-                )  # Send a termination signal to the child process
-                break
+            if not process.is_alive():
+                print("The Python function has finished running.")
+                break
 
-            if not process.is_alive():
-                print("The Python function has finished running.")
-                break
+        process.join()
 
-        process.join()
+
+ENVIRONMENT = os.getenv("ENVIRONMENT") or "production"
diff --git a/agbenchmark/config.json b/agbenchmark/config.json
index 7388085dc..d9b42ca42 100644
--- a/agbenchmark/config.json
+++ b/agbenchmark/config.json
@@ -1,10 +1,5 @@
 {
     "workspace": "C:\\Users\\silen\\miniagi",
-    "agent": {
-        "type": "script",
-        "path": "",
-        "script": "python miniagi.py {}",
-        "user_input": "\n",
-        "cutoff": 60
-    }
+    "func_path": "agent/benchmarks.py",
+    "cutoff": 60
 }
diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py
index c9f3643cc..fe395cd21 100644
--- a/agbenchmark/start_benchmark.py
+++ b/agbenchmark/start_benchmark.py
@@ -29,7 +29,17 @@ def start(category, noreg, mock):
 
         config["workspace"] = click.prompt(
             "Please enter a new workspace path",
-            default=os.path.join(Path.home(), "miniagi"),
+            default=os.path.join(Path.home(), "workspace"),
+        )
+
+        config["func_path"] = click.prompt(
+            "Please enter the path to your run_specific_agent function implementation",
+            default="/benchmarks.py",
+        )
+
+        config["cutoff"] = click.prompt(
+            "Please enter a hard cutoff runtime for your agent",
+            default="60",
         )
 
         with open(config_dir, "w") as f:
diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json
index d13b763c7..9e26dfeeb 100644
--- a/agbenchmark/tests/regression/regression_tests.json
+++ b/agbenchmark/tests/regression/regression_tests.json
@@ -1,7 +1 @@
-{
-    "TestWriteFile": {
-        "difficulty": "basic",
-        "dependencies": [],
-        "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0]"
-    }
-}
\ No newline at end of file
+{}
\ No newline at end of file
diff --git a/agent/benchmarks.py b/agent/benchmarks.py
new file mode 100644
index 000000000..eb66412c1
--- /dev/null
+++ b/agent/benchmarks.py
@@ -0,0 +1,15 @@
+# import subprocess
+
+
+def run_specific_agent(task, conn):
+    cycle_count = 0
+    while (
+        not conn.poll()
+    ):  # Check if there's a termination signal from the main process
+        response = run_agent(task)  # run the agent and get the response and cycle count
+
+        if response:
+            cycle_count += 1
+
+        # Send response and cycle count back to the main process
+        conn.send((response, cycle_count))
diff --git a/agent/hook.py b/agent/hook.py
deleted file mode 100644
index 6fa534180..000000000
--- a/agent/hook.py
+++ /dev/null
@@ -1,10 +0,0 @@
-async def run_specific_agent(task, conn):
-    while (
-        not conn.poll()
-    ):  # Check if there's a termination signal from the main process
-        response, cycle_count = await run_agent(
-            task
-        )  # run the agent and get the response and cycle count
-
-        # Send response and cycle count back to the main process
-        conn.send((response, cycle_count))
-- cgit v1.2.3

From 2062844fa6b0250017ba65712e1a590a5fc28616 Mon Sep 17 00:00:00 2001
From: merwanehamadi
Date: Sun, 2 Jul 2023 07:38:30 -0700
Subject: Integrate one challenge to auto gpt (#44)

---
 .github/workflows/autogpt.yml                      | 62 +++++++++++++++++++++
 .gitignore                                         |  4 +-
 .gitmodules                                        |  4 ++
 agbenchmark/agent_interface.py                     | 62 ++++++++++-----------
 agbenchmark/config.json                            |  4 +-
 agbenchmark/conftest.py                            | 21 ++++---
 agbenchmark/start_benchmark.py                     | 64 +++++++++++++---------
 .../basic_abilities/write_file/write_file_test.py  |  7 ++-
 agbenchmark/tests/regression/regression_tests.json |  1 -
 agent/Auto-GPT                                     |  1 +
 agent/mini-agi                                     |  1 -
 regression_tests.json                              |  7 +++
 12 files changed, 164 insertions(+), 74 deletions(-)
 create mode 100644 .github/workflows/autogpt.yml
 create mode 100644 .gitmodules
 delete mode 100644 agbenchmark/tests/regression/regression_tests.json
 create mode 160000 agent/Auto-GPT
 delete mode 160000 agent/mini-agi
 create mode 100644 regression_tests.json

diff --git a/.github/workflows/autogpt.yml b/.github/workflows/autogpt.yml
new file mode 100644
index 000000000..2b1925117
--- /dev/null
+++ b/.github/workflows/autogpt.yml
@@ -0,0 +1,62 @@
+name: Auto-GPT Regression Test
+
+on:
+  workflow_dispatch:
+
+jobs:
+  regression-tests:
+    permissions:
+      pull-requests: write
+      contents: write
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    strategy:
+      matrix:
+        python-version: ["3.10"]
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.pull_request.head.ref }}
+          repository: ${{ github.event.pull_request.head.repo.full_name }}
+          submodules: true
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - id: get_date
+        name: Get date
+        run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
+
+      - name: Install Poetry
+        run: |
+          curl -sSL https://install.python-poetry.org | python -
+
+      - name: Set up Poetry cache
+        uses: actions/cache@v2
+        with:
+          path: |
+            ~/.cache/pypoetry
+            .venv
+          key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }}
+
+      - name: Set up venv and install Python dependencies
+        run: |
+          python -m venv venv
+          source venv/bin/activate
+          poetry install
+ + - name: Build project + run: | + source venv/bin/activate + poetry build + cd agent/Auto-GPT + pip install -r requirements.txt + pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl + agbenchmark start --reg + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} diff --git a/.gitignore b/.gitignore index 68bc17f9f..c41065ca4 100644 --- a/.gitignore +++ b/.gitignore @@ -157,4 +157,6 @@ cython_debug/ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ +.idea/ +.DS_Store +``` diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..b2dc714c5 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "Auto-GPT"] + path = agent/Auto-GPT + url = https://github.com/Significant-Gravitas/Auto-GPT.git + branch = benchmark-integration diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index 2ff2acf30..0961dc0f0 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -1,9 +1,10 @@ -import os import importlib -import time -from agbenchmark.mocks.MockManager import MockManager -from multiprocessing import Process, Pipe +from agbenchmark.mocks.MockManager import MockManager +import os +import sys +import subprocess +import time from dotenv import load_dotenv load_dotenv() @@ -26,45 +27,44 @@ def run_agent(task, mock_func, config): timeout = config["cutoff"] print(f"Running Python function '{config['func_path']}' with timeout {timeout}") - parent_conn, child_conn = Pipe() + # Get the current working directory + cwd = os.getcwd() + + # Add current directory to Python's import path + sys.path.append(cwd) + - # Import the specific agent dynamically module_name = config["func_path"].replace("/", ".").rstrip(".py") module = importlib.import_module(module_name) - run_specific_agent = getattr(module, "run_specific_agent") - process = Process(target=run_specific_agent, args=(task, child_conn)) - process.start() + + command = [sys.executable, "benchmarks.py", str(task)] + process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True, cwd=cwd) + start_time = time.time() + timeout = config["cutoff"] while True: - if ( - parent_conn.poll() - ): # Check if there's a new message from the child process - response, cycle_count = parent_conn.recv() - print(f"Cycle {cycle_count}: {response}") - - if cycle_count >= config["cutoff"]: - print( - f"Cycle count has reached the limit of {config['cutoff']}. Terminating." - ) - child_conn.send("terminate") - break + output = process.stdout.readline() + print(output.strip()) - if time.time() - start_time > timeout: - print( - "The Python function has exceeded the time limit and was terminated." 
- ) - child_conn.send( - "terminate" - ) # Send a termination signal to the child process + # Check if process has ended + if process.poll() is not None: + print("The Python function has finished running.") break - if not process.is_alive(): - print("The Python function has finished running.") + # Check if process has exceeded timeout + if time.time() - start_time > timeout: + print("The Python function has exceeded the time limit and was terminated.") + process.terminate() break - process.join() + # Optional: sleep for a while + time.sleep(0.1) + + # Wait for process to terminate, then get return code + process.wait() + ENVIRONMENT = os.getenv("ENVIRONMENT") or "production" diff --git a/agbenchmark/config.json b/agbenchmark/config.json index d9b42ca42..e1c5f154b 100644 --- a/agbenchmark/config.json +++ b/agbenchmark/config.json @@ -1,5 +1,5 @@ { - "workspace": "C:\\Users\\silen\\miniagi", - "func_path": "agent/benchmarks.py", + "workspace": "autogpt/workspace/auto_gpt_workspace", + "func_path": "benchmarks.py", "cutoff": 60 } diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 0f1fc7bb2..4284d1ebf 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -1,15 +1,18 @@ import json import os +from pathlib import Path + import pytest import shutil from agbenchmark.tests.regression.RegressionManager import RegressionManager +from agbenchmark.start_benchmark import CONFIG_PATH, REGRESSION_TESTS_PATH @pytest.fixture(scope="module") def config(request): - config_file = os.path.abspath("agbenchmark/config.json") - print(f"Config file: {config_file}") - with open(config_file, "r") as f: + + print(f"Config file: {CONFIG_PATH}") + with open(CONFIG_PATH, "r") as f: config = json.load(f) if request.config.getoption("--mock"): @@ -36,10 +39,7 @@ def workspace(config): def pytest_addoption(parser): parser.addoption("--mock", action="store_true", default=False) - -regression_json = "agbenchmark/tests/regression/regression_tests.json" - -regression_manager = RegressionManager(regression_json) +regression_manager = RegressionManager(REGRESSION_TESTS_PATH) # this is to get the challenge_data from every test @@ -53,13 +53,16 @@ def pytest_runtest_makereport(item, call): challenge_data = item.funcargs.get("challenge_data", None) difficulty = challenge_data.info.difficulty if challenge_data else "unknown" dependencies = challenge_data.dependencies if challenge_data else [] - + parts = item.nodeid.split("::")[0].split("/") + agbenchmark_index = parts.index("agbenchmark") + file_path = "/".join(parts[agbenchmark_index:]) test_details = { "difficulty": difficulty, "dependencies": dependencies, - "test": item.nodeid, + "test": file_path, } + print("pytest_runtest_makereport", test_details) if call.excinfo is None: regression_manager.add_test(item.nodeid.split("::")[1], test_details) diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index fe395cd21..28b038e9a 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -7,6 +7,13 @@ from dotenv import load_dotenv, set_key load_dotenv() +CURRENT_DIRECTORY = Path(__file__).resolve().parent + +new_path = CURRENT_DIRECTORY / "config.json" + +CONFIG_PATH = str(new_path.resolve()) + +REGRESSION_TESTS_PATH = str(Path(os.getcwd()) / "regression_tests.json") @click.group() def cli(): @@ -15,16 +22,12 @@ def cli(): @cli.command() @click.option("--category", default=None, help="Specific category to run") -@click.option("--noreg", is_flag=True, help="Skip regression tests") +@click.option("--reg", 
is_flag=True, help="Runs only regression tests") @click.option("--mock", is_flag=True, help="Run with mock") -def start(category, noreg, mock): +def start(category, reg, mock): """Start the benchmark tests. If a category flag is provided, run the categories with that mark.""" - config_file = "agbenchmark/config.json" - - config_dir = os.path.abspath(config_file) - # Check if configuration file exists and is not empty - if not os.path.exists(config_dir) or os.stat(config_dir).st_size == 0: + if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0: config = {} config["workspace"] = click.prompt( @@ -42,11 +45,11 @@ def start(category, noreg, mock): default="60", ) - with open(config_dir, "w") as f: + with open(CONFIG_PATH, "w") as f: json.dump(config, f) else: # If the configuration file exists and is not empty, load it - with open(config_dir, "r") as f: + with open(CONFIG_PATH, "r") as f: config = json.load(f) set_key(".env", "MOCK_TEST", "True" if mock else "False") @@ -58,11 +61,9 @@ def start(category, noreg, mock): if not os.path.exists(workspace_path): os.makedirs(workspace_path, exist_ok=True) - regression_path = os.path.abspath( - "agbenchmark/tests/regression/regression_tests.json" - ) - if not os.path.exists(regression_path): - with open(regression_path, "a"): + + if not os.path.exists(REGRESSION_TESTS_PATH): + with open(REGRESSION_TESTS_PATH, "a"): pass print("Current configuration:") @@ -70,31 +71,40 @@ def start(category, noreg, mock): print(f"{key}: {value}") print("Starting benchmark tests...", category) - pytest_args = ["agbenchmark", "-vs"] + tests_to_run = [] + pytest_args = ["-vs"] if category: pytest_args.extend( ["-m", category] - ) # run categorys that are of a specific marker - if noreg: - pytest_args.extend( - ["-k", "not regression"] - ) # run categorys that are of a specific marker but don't include regression categorys - print(f"Running {'non-regression' + category if noreg else category} categorys") + ) else: - if noreg: - print("Running all non-regression categorys") - pytest_args.extend( - ["-k", "not regression"] - ) # run categorys that are not regression categorys + if reg: + print("Running all regression tests") + tests_to_run = get_regression_tests() else: - print("Running all categorys") # run all categorys + print("Running all categories") if mock: pytest_args.append("--mock") # Run pytest with the constructed arguments + if not tests_to_run: + tests_to_run = [str(CURRENT_DIRECTORY)] + pytest_args.extend(tests_to_run) pytest.main(pytest_args) +def get_regression_tests(): + if not Path(REGRESSION_TESTS_PATH).exists(): + with open(REGRESSION_TESTS_PATH, 'w') as file: + json.dump({}, file) + + with open(REGRESSION_TESTS_PATH, 'r') as file: + data = json.load(file) + + regression_tests = [str(CURRENT_DIRECTORY / ".." 
/ value['test']) for key, value in data.items()] + + return regression_tests + if __name__ == "__main__": start() diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 306375ddd..8d3eb5404 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -1,3 +1,5 @@ +from pathlib import Path + import pytest from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os @@ -9,10 +11,11 @@ class TestWriteFile(BasicChallenge): def get_file_path(self) -> str: # all tests must implement this method return os.path.join(os.path.dirname(__file__), "w_file_data.json") - @pytest.mark.depends(on=[], name="basic_write_file") def test_method(self, config): self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + + workspace = Path(os.getcwd()) / config['workspace'] + files_contents = self.open_files(workspace, self.data.ground.files) scores = [] for file_content in files_contents: diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json deleted file mode 100644 index 9e26dfeeb..000000000 --- a/agbenchmark/tests/regression/regression_tests.json +++ /dev/null @@ -1 +0,0 @@ -{} \ No newline at end of file diff --git a/agent/Auto-GPT b/agent/Auto-GPT new file mode 160000 index 000000000..c29ec925f --- /dev/null +++ b/agent/Auto-GPT @@ -0,0 +1 @@ +Subproject commit c29ec925fd9e24f219ef0f2884b08908cd66239b diff --git a/agent/mini-agi b/agent/mini-agi deleted file mode 160000 index d2add8f18..000000000 --- a/agent/mini-agi +++ /dev/null @@ -1 +0,0 @@ -Subproject commit d2add8f18caf96934a2d193583720cfc9b89451b diff --git a/regression_tests.json b/regression_tests.json new file mode 100644 index 000000000..e3633a2af --- /dev/null +++ b/regression_tests.json @@ -0,0 +1,7 @@ +{ + "TestWriteFile": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py" + } +} \ No newline at end of file -- cgit v1.2.3 From 838f72097cc82b9e12dead330632b83056c7b3f6 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Sun, 2 Jul 2023 13:14:49 -0700 Subject: Add static linters ci (#45) --- .flake8 | 13 + .github/workflows/autogpt.yml | 10 +- .github/workflows/ci.yml | 68 ++++ .gitmodules | 2 +- .python-version | 1 + agbenchmark/Challenge.py | 124 ------ agbenchmark/agent_interface.py | 29 +- agbenchmark/challenge.py | 126 ++++++ agbenchmark/challenges/define_task_types.py | 6 +- agbenchmark/challenges/retrieval/Retrieval.py | 5 +- agbenchmark/challenges/retrieval/r1/r1_test.py | 6 +- agbenchmark/conftest.py | 26 +- agbenchmark/mocks/MockManager.py | 28 -- agbenchmark/mocks/mock_manager.py | 29 ++ agbenchmark/mocks/tests/basic_mocks.py | 6 +- agbenchmark/mocks/tests/retrieval_mocks.py | 5 +- agbenchmark/start_benchmark.py | 31 +- .../tests/basic_abilities/BasicChallenge.py | 9 - .../tests/basic_abilities/basic_challenge.py | 8 + .../basic_abilities/read_file/read_file_test.py | 13 +- .../basic_abilities/write_file/write_file_test.py | 10 +- mypy.ini | 5 + poetry.lock | 422 ++++++++++++++------- pyproject.toml | 28 +- 24 files changed, 652 insertions(+), 358 deletions(-) create mode 100644 .flake8 create mode 100644 .github/workflows/ci.yml create mode 100644 .python-version delete mode 100644 agbenchmark/Challenge.py create mode 100644 
agbenchmark/challenge.py delete mode 100644 agbenchmark/mocks/MockManager.py create mode 100644 agbenchmark/mocks/mock_manager.py delete mode 100644 agbenchmark/tests/basic_abilities/BasicChallenge.py create mode 100644 agbenchmark/tests/basic_abilities/basic_challenge.py create mode 100644 mypy.ini diff --git a/.flake8 b/.flake8 new file mode 100644 index 000000000..cb9c777b5 --- /dev/null +++ b/.flake8 @@ -0,0 +1,13 @@ +[flake8] +max-line-length = 88 +select = "E303, W293, W291, W292, E305, E231, E302" +exclude = + .tox, + __pycache__, + *.pyc, + .env + venv*/*, + .venv/*, + reports/*, + dist/*, + agent/* diff --git a/.github/workflows/autogpt.yml b/.github/workflows/autogpt.yml index 2b1925117..e889b4c35 100644 --- a/.github/workflows/autogpt.yml +++ b/.github/workflows/autogpt.yml @@ -46,14 +46,14 @@ jobs: - name: Set up venv and install Python dependencies run: | - python -m venv venv - source venv/bin/activate - poetry install + poetry install --only main + poetry build + - - name: Build project + - name: Run regression tests run: | + python -m venv venv source venv/bin/activate - poetry build cd agent/Auto-GPT pip install -r requirements.txt pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 000000000..2d25e4ffe --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,68 @@ +name: Python CI + +on: + push: + branches: [ master, ci-test* ] + pull_request: + branches: [ stable, master, release-* ] + +jobs: + lint: + + runs-on: ubuntu-latest + env: + min-python-version: "3.10" + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 0 + ref: ${{ github.event.pull_request.head.ref }} + repository: ${{ github.event.pull_request.head.repo.full_name }} + + - name: Set up Python ${{ env.min-python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ env.min-python-version }} + + - id: get_date + name: Get date + run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT + + - name: Install Poetry + run: | + curl -sSL https://install.python-poetry.org | python - + + - name: Set up Poetry cache + uses: actions/cache@v2 + with: + path: | + ~/.cache/pypoetry + .venv + key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} + + - name: Install dependencies + run: | + poetry install + + - name: Lint with flake8 + run: poetry run flake8 + + - name: Check black formatting + run: poetry run black . --check + if: success() || failure() + + - name: Check isort formatting + run: poetry run isort . --check + if: success() || failure() + + - name: Check mypy formatting + run: poetry run mypy --ignore-missing-imports . 
+ if: success() || failure() + + - name: Check for unused imports and pass statements + run: | + cmd="poetry run autoflake --remove-all-unused-imports --recursive --ignore-init-module-imports --ignore-pass-after-docstring agbenchmark" + $cmd --check || (echo "You have unused imports or pass statements, please run '${cmd} --in-place'" && exit 1) + if: success() || failure() diff --git a/.gitmodules b/.gitmodules index b2dc714c5..2e3a86e5f 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,4 +1,4 @@ -[submodule "Auto-GPT"] +[submodule "agent/Auto-GPT"] path = agent/Auto-GPT url = https://github.com/Significant-Gravitas/Auto-GPT.git branch = benchmark-integration diff --git a/.python-version b/.python-version new file mode 100644 index 000000000..d5cd4cce2 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.10.10 diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py deleted file mode 100644 index d7a2bdc9b..000000000 --- a/agbenchmark/Challenge.py +++ /dev/null @@ -1,124 +0,0 @@ -import os -import glob -import pytest -from abc import ABC, abstractmethod -from agbenchmark.challenges.define_task_types import Ground -from agbenchmark.challenges.define_task_types import ChallengeData -from dotenv import load_dotenv - -load_dotenv() - -mock_test_str = os.getenv("MOCK_TEST") -MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str else False - - -class Challenge(ABC): - """The parent class to all specific challenges classes. - Defines helper methods for running a challenge""" - - @abstractmethod - def get_file_path(self) -> str: - """This should be implemented by any class which inherits from BasicChallenge""" - pass - - @property - def data(self) -> ChallengeData: - # TODO: make it so that this is cached somewhere to just call self.deserialized_data - return ChallengeData.deserialize(self.get_file_path()) - - @property - def mock(self): - return self.data.mock.mock_func if self.data.mock else None - - @property - def task(self): - return ( - self.data.mock.mock_task if self.data.mock and MOCK_TEST else self.data.task - ) - - @property - def dependencies(self) -> list: - return self.data.dependencies - - def setup_challenge(self, config): - from agbenchmark.agent_interface import run_agent - - run_agent(self.task, self.mock, config) - - @property - def name(self) -> str: - return self.data.name - - @pytest.mark.parametrize( - "challenge_data", - [data], - indirect=True, - ) - def test_method(self, config): - raise NotImplementedError - - @staticmethod - def open_file(workspace: str, filename: str): - script_dir = os.path.abspath(workspace) - workspace_dir = os.path.join(script_dir, filename) - with open(workspace_dir, "r") as f: - return f.read() - - @staticmethod - def open_files(workspace: str, file_patterns: list): - script_dir = os.path.abspath(workspace) - files_contents = [] - - for file_pattern in file_patterns: - # Check if it is a file extension - if file_pattern.startswith("."): - # Find all files with the given extension in the workspace - matching_files = glob.glob(os.path.join(script_dir, "*" + file_pattern)) - else: - # Otherwise, it is a specific file - matching_files = [os.path.join(script_dir, file_pattern)] - - for file_path in matching_files: - with open(file_path, "r") as f: - files_contents.append(f.read()) - - return files_contents - - @staticmethod - def write_to_file(workspace: str, filename: str, content: str): - script_dir = os.path.abspath(workspace) - print("Writing file at", script_dir) - workspace_dir = os.path.join(script_dir, filename) - - # Open the 
file in write mode. - with open(workspace_dir, "w") as f: - # Write the content to the file. - f.write(content) - - def get_filenames_in_workspace(self, workspace: str): - return [ - filename - for filename in os.listdir(workspace) - if os.path.isfile(os.path.join(workspace, filename)) - ] - - def scoring(self, content: str, ground: Ground): - if ground.should_contain: - for should_contain_word in ground.should_contain: - if should_contain_word not in content: - return 0.0 - else: - print( - f"Word that should exist: {should_contain_word} exists in the content" - ) - - if ground.should_not_contain: - for should_not_contain_word in ground.should_not_contain: - if should_not_contain_word in content: - return 0.0 - else: - print( - f"Word that should not exist: {should_not_contain_word} does not exist in the content" - ) - - return 1.0 diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index 0961dc0f0..bd75f8dbb 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -1,18 +1,22 @@ import importlib - -from agbenchmark.mocks.MockManager import MockManager import os -import sys import subprocess +import sys import time +from typing import Any, Dict, Optional + from dotenv import load_dotenv +from agbenchmark.mocks.mock_manager import MockManager + load_dotenv() MOCK_FLAG = os.getenv("MOCK_TEST") -def run_agent(task, mock_func, config): +def run_agent( + task: Optional[str], mock_func: Optional[str], config: Dict[str, Any] +) -> None: """Calling to get a response""" if mock_func == None and MOCK_FLAG == "True": @@ -33,18 +37,24 @@ def run_agent(task, mock_func, config): # Add current directory to Python's import path sys.path.append(cwd) - module_name = config["func_path"].replace("/", ".").rstrip(".py") module = importlib.import_module(module_name) - command = [sys.executable, "benchmarks.py", str(task)] - process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True, cwd=cwd) + process = subprocess.Popen( + command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True, + cwd=cwd, + ) start_time = time.time() timeout = config["cutoff"] while True: + if process.stdout is None: + continue output = process.stdout.readline() print(output.strip()) @@ -55,7 +65,9 @@ def run_agent(task, mock_func, config): # Check if process has exceeded timeout if time.time() - start_time > timeout: - print("The Python function has exceeded the time limit and was terminated.") + print( + "The Python function has exceeded the time limit and was terminated." + ) process.terminate() break @@ -66,5 +78,4 @@ def run_agent(task, mock_func, config): process.wait() - ENVIRONMENT = os.getenv("ENVIRONMENT") or "production" diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py new file mode 100644 index 000000000..eaed73a22 --- /dev/null +++ b/agbenchmark/challenge.py @@ -0,0 +1,126 @@ +import glob +import os +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Optional + +import pytest +from dotenv import load_dotenv + +from agbenchmark.challenges.define_task_types import ChallengeData, Ground + +load_dotenv() + +mock_test_str = os.getenv("MOCK_TEST") +MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str else False + + +class Challenge(ABC): + """The parent class to all specific challenges classes. 
+ Defines helper methods for running a challenge""" + + @abstractmethod + def get_file_path(self) -> str: + """This should be implemented by any class which inherits from BasicChallenge""" + pass + + @property + def data(self) -> ChallengeData: + # TODO: make it so that this is cached somewhere to just call self.deserialized_data + return ChallengeData.deserialize(self.get_file_path()) + + @property + def mock(self) -> Optional[str]: + return self.data.mock.mock_func if self.data.mock else None + + @property + def task(self) -> Optional[str]: + return ( + self.data.mock.mock_task if self.data.mock and MOCK_TEST else self.data.task + ) + + @property + def dependencies(self) -> list: + return self.data.dependencies + + def setup_challenge(self, config: Dict[str, Any]) -> None: + from agbenchmark.agent_interface import run_agent + + run_agent(self.task, self.mock, config) + + @property + def name(self) -> str: + return self.data.name + + @pytest.mark.parametrize( + "challenge_data", + [data], + indirect=True, + ) + def test_method(self, config: Dict[str, Any]) -> None: + raise NotImplementedError + + @staticmethod + def open_file(workspace: str, filename: str) -> str: + script_dir = os.path.abspath(workspace) + workspace_dir = os.path.join(script_dir, filename) + with open(workspace_dir, "r") as f: + return f.read() + + @staticmethod + def open_files(workspace: str, file_patterns: list) -> List[str]: + script_dir = os.path.abspath(workspace) + files_contents = [] + + for file_pattern in file_patterns: + # Check if it is a file extension + if file_pattern.startswith("."): + # Find all files with the given extension in the workspace + matching_files = glob.glob(os.path.join(script_dir, "*" + file_pattern)) + else: + # Otherwise, it is a specific file + matching_files = [os.path.join(script_dir, file_pattern)] + + for file_path in matching_files: + with open(file_path, "r") as f: + files_contents.append(f.read()) + + return files_contents + + @staticmethod + def write_to_file(workspace: str, filename: str, content: str) -> None: + script_dir = os.path.abspath(workspace) + print("Writing file at", script_dir) + workspace_dir = os.path.join(script_dir, filename) + + # Open the file in write mode. + with open(workspace_dir, "w") as f: + # Write the content to the file. 
+ f.write(content) + + def get_filenames_in_workspace(self, workspace: str) -> List[str]: + return [ + filename + for filename in os.listdir(workspace) + if os.path.isfile(os.path.join(workspace, filename)) + ] + + def scoring(self, content: str, ground: Ground) -> float: + if ground.should_contain: + for should_contain_word in ground.should_contain: + if should_contain_word not in content: + return 0.0 + else: + print( + f"Word that should exist: {should_contain_word} exists in the content" + ) + + if ground.should_not_contain: + for should_not_contain_word in ground.should_not_contain: + if should_not_contain_word in content: + return 0.0 + else: + print( + f"Word that should not exist: {should_not_contain_word} does not exist in the content" + ) + + return 1.0 diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index 7fc2361b5..52df3017b 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -1,7 +1,7 @@ -from pydantic import BaseModel -from typing import List, Optional import json -import os +from typing import List, Optional + +from pydantic import BaseModel class Mock(BaseModel): diff --git a/agbenchmark/challenges/retrieval/Retrieval.py b/agbenchmark/challenges/retrieval/Retrieval.py index b8aa81ce3..891cccef7 100644 --- a/agbenchmark/challenges/retrieval/Retrieval.py +++ b/agbenchmark/challenges/retrieval/Retrieval.py @@ -1,9 +1,8 @@ -from agbenchmark.Challenge import Challenge import pytest +from agbenchmark.challenge import Challenge + @pytest.mark.retrieval class RetrievalChallenge(Challenge): """Challenge for information-retrieval""" - - pass diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index b679a731d..675ac8bd7 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -1,5 +1,7 @@ -from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge import os +from typing import Any, Dict + +from agbenchmark.challenges.retrieval.retrieval import RetrievalChallenge class TestRetrieval1(RetrievalChallenge): @@ -8,7 +10,7 @@ class TestRetrieval1(RetrievalChallenge): def get_file_path(self) -> str: # all tests must implement this method return os.path.join(os.path.dirname(__file__), "r1_data.json") - def test_method(self, config): + def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) files_contents = self.open_files(config["workspace"], self.data.ground.files) diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 4284d1ebf..613565fd2 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -1,16 +1,16 @@ import json import os -from pathlib import Path +import shutil +from typing import Any, Dict, Generator, List import pytest -import shutil -from agbenchmark.tests.regression.RegressionManager import RegressionManager + from agbenchmark.start_benchmark import CONFIG_PATH, REGRESSION_TESTS_PATH +from agbenchmark.tests.regression.RegressionManager import RegressionManager @pytest.fixture(scope="module") -def config(request): - +def config(request: Any) -> None: print(f"Config file: {CONFIG_PATH}") with open(CONFIG_PATH, "r") as f: config = json.load(f) @@ -22,7 +22,7 @@ def config(request): @pytest.fixture(scope="module") -def workspace(config): +def workspace(config: Dict[str, Any]) -> Generator[str, None, None]: yield config["workspace"] # teardown after test function completes for filename in 
os.listdir(config["workspace"]): @@ -36,19 +36,20 @@ def workspace(config): print(f"Failed to delete {file_path}. Reason: {e}") -def pytest_addoption(parser): +def pytest_addoption(parser: Any) -> None: parser.addoption("--mock", action="store_true", default=False) + regression_manager = RegressionManager(REGRESSION_TESTS_PATH) # this is to get the challenge_data from every test @pytest.fixture(autouse=True) -def challenge_data(request): +def challenge_data(request: Any) -> None: return request.param -def pytest_runtest_makereport(item, call): +def pytest_runtest_makereport(item: Any, call: Any) -> None: if call.when == "call": challenge_data = item.funcargs.get("challenge_data", None) difficulty = challenge_data.info.difficulty if challenge_data else "unknown" @@ -62,7 +63,6 @@ def pytest_runtest_makereport(item, call): "test": file_path, } - print("pytest_runtest_makereport", test_details) if call.excinfo is None: regression_manager.add_test(item.nodeid.split("::")[1], test_details) @@ -70,7 +70,7 @@ def pytest_runtest_makereport(item, call): regression_manager.remove_test(item.nodeid.split("::")[1]) -def pytest_collection_modifyitems(items): +def pytest_collection_modifyitems(items: List[Any]) -> None: """Called once all test items are collected. Used to add regression and depends markers to collected test items.""" for item in items: @@ -80,13 +80,13 @@ def pytest_collection_modifyitems(items): item.add_marker(pytest.mark.regression) -def pytest_sessionfinish(): +def pytest_sessionfinish() -> None: """Called at the end of the session to save regression tests""" regression_manager.save() # this is so that all tests can inherit from the Challenge class -def pytest_generate_tests(metafunc): +def pytest_generate_tests(metafunc: Any) -> None: if "challenge_data" in metafunc.fixturenames: # Get the instance of the test class test_class = metafunc.cls() diff --git a/agbenchmark/mocks/MockManager.py b/agbenchmark/mocks/MockManager.py deleted file mode 100644 index f4e7f5f5a..000000000 --- a/agbenchmark/mocks/MockManager.py +++ /dev/null @@ -1,28 +0,0 @@ -import sys -import agbenchmark.mocks.tests.basic_mocks as basic_mocks -import agbenchmark.mocks.tests.retrieval_mocks as retrieval_mocks - - -class MockManager: - def __init__(self, task: str): - self.task = task - self.workspace = "agbenchmark/mocks/workspace" - self.modules = [basic_mocks, retrieval_mocks] - - def delegate(self, mock_function_name, *args, **kwargs): - if hasattr(self, mock_function_name): - # Check if the mock function is an attribute of this class - getattr(self, mock_function_name)(*args, **kwargs) - elif mock_function_name in globals(): - # Check if the function is imported in the file - func = globals()[mock_function_name] - func(self.task, self.workspace, *args, **kwargs) - elif len(self.modules) > 0: - # checks if function is in imported modules - for module in self.modules: - if hasattr(module, mock_function_name): - func = getattr(module, mock_function_name) - func(self.task, self.workspace, *args, **kwargs) - return - else: - raise ValueError(f"No such mock: {mock_function_name}") diff --git a/agbenchmark/mocks/mock_manager.py b/agbenchmark/mocks/mock_manager.py new file mode 100644 index 000000000..59fa8dbf1 --- /dev/null +++ b/agbenchmark/mocks/mock_manager.py @@ -0,0 +1,29 @@ +from typing import Any + +import agbenchmark.mocks.tests.basic_mocks as basic_mocks +import agbenchmark.mocks.tests.retrieval_mocks as retrieval_mocks + + +class MockManager: + def __init__(self, task: str): + self.task = task + 
self.workspace = "agbenchmark/mocks/workspace" + self.modules = [basic_mocks, retrieval_mocks] + + def delegate(self, mock_function_name: Any, *args: Any, **kwargs: Any) -> None: + if hasattr(self, mock_function_name): + # Check if the mock function is an attribute of this class + getattr(self, mock_function_name)(*args, **kwargs) + elif mock_function_name in globals(): + # Check if the function is imported in the file + func = globals()[mock_function_name] + func(self.task, self.workspace, *args, **kwargs) + elif len(self.modules) > 0: + # checks if function is in imported modules + for module in self.modules: + if hasattr(module, mock_function_name): + func = getattr(module, mock_function_name) + func(self.task, self.workspace, *args, **kwargs) + return + else: + raise ValueError(f"No such mock: {mock_function_name}") diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index 631b30c2c..c79a8e2dd 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -1,7 +1,7 @@ -from agbenchmark.Challenge import Challenge +from agbenchmark.challenge import Challenge -def basic_read_file_mock(task: str, workspace: str): +def basic_read_file_mock(task: str, workspace: str) -> None: """ This mock reads a file and returns its content. """ @@ -13,7 +13,7 @@ def basic_read_file_mock(task: str, workspace: str): ) -def basic_write_file_mock(task: str, workspace: str): +def basic_write_file_mock(task: str, workspace: str) -> None: """ This mock writes to a file (creates one if it doesn't exist) """ diff --git a/agbenchmark/mocks/tests/retrieval_mocks.py b/agbenchmark/mocks/tests/retrieval_mocks.py index 2481de060..9a8a57db4 100644 --- a/agbenchmark/mocks/tests/retrieval_mocks.py +++ b/agbenchmark/mocks/tests/retrieval_mocks.py @@ -1,8 +1,5 @@ -from agbenchmark.Challenge import Challenge - - # TODO: Make it so that you can specify for tests to only run if their prerequisites are met. # Prerequisites here would be writing to a file (basic_abilities test). # Should also check if prerequisites exists in regression file -def retrieval_1_mock(task: str, workspace: str): +def retrieval_1_mock(task: str, workspace: str) -> None: pass diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index 28b038e9a..13e1af231 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -1,8 +1,11 @@ -import click -import pytest import json import os +import sys from pathlib import Path +from typing import List + +import click +import pytest from dotenv import load_dotenv, set_key load_dotenv() @@ -15,8 +18,9 @@ CONFIG_PATH = str(new_path.resolve()) REGRESSION_TESTS_PATH = str(Path(os.getcwd()) / "regression_tests.json") + @click.group() -def cli(): +def cli() -> None: pass @@ -24,7 +28,7 @@ def cli(): @click.option("--category", default=None, help="Specific category to run") @click.option("--reg", is_flag=True, help="Runs only regression tests") @click.option("--mock", is_flag=True, help="Run with mock") -def start(category, reg, mock): +def start(category: str, reg: bool, mock: bool) -> int: """Start the benchmark tests. 
If a category flag is provided, run the categories with that mark.""" # Check if configuration file exists and is not empty if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0: @@ -61,7 +65,6 @@ def start(category, reg, mock): if not os.path.exists(workspace_path): os.makedirs(workspace_path, exist_ok=True) - if not os.path.exists(REGRESSION_TESTS_PATH): with open(REGRESSION_TESTS_PATH, "a"): pass @@ -74,9 +77,7 @@ def start(category, reg, mock): tests_to_run = [] pytest_args = ["-vs"] if category: - pytest_args.extend( - ["-m", category] - ) + pytest_args.extend(["-m", category]) else: if reg: print("Running all regression tests") @@ -91,20 +92,24 @@ def start(category, reg, mock): if not tests_to_run: tests_to_run = [str(CURRENT_DIRECTORY)] pytest_args.extend(tests_to_run) - pytest.main(pytest_args) + return sys.exit(pytest.main(pytest_args)) -def get_regression_tests(): + +def get_regression_tests() -> List[str]: if not Path(REGRESSION_TESTS_PATH).exists(): - with open(REGRESSION_TESTS_PATH, 'w') as file: + with open(REGRESSION_TESTS_PATH, "w") as file: json.dump({}, file) - with open(REGRESSION_TESTS_PATH, 'r') as file: + with open(REGRESSION_TESTS_PATH, "r") as file: data = json.load(file) - regression_tests = [str(CURRENT_DIRECTORY / ".." / value['test']) for key, value in data.items()] + regression_tests = [ + str(CURRENT_DIRECTORY / ".." / value["test"]) for key, value in data.items() + ] return regression_tests + if __name__ == "__main__": start() diff --git a/agbenchmark/tests/basic_abilities/BasicChallenge.py b/agbenchmark/tests/basic_abilities/BasicChallenge.py deleted file mode 100644 index 6e7f73100..000000000 --- a/agbenchmark/tests/basic_abilities/BasicChallenge.py +++ /dev/null @@ -1,9 +0,0 @@ -import pytest -from agbenchmark.Challenge import Challenge -from agbenchmark.challenges.define_task_types import ChallengeData -from abc import abstractmethod - - -@pytest.mark.basic -class BasicChallenge(Challenge): - pass diff --git a/agbenchmark/tests/basic_abilities/basic_challenge.py b/agbenchmark/tests/basic_abilities/basic_challenge.py new file mode 100644 index 000000000..8b3a4db1d --- /dev/null +++ b/agbenchmark/tests/basic_abilities/basic_challenge.py @@ -0,0 +1,8 @@ +import pytest + +from agbenchmark.challenge import Challenge + + +@pytest.mark.basic +class BasicChallenge(Challenge): + pass diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index c0aaa7f93..c5f886d52 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -1,14 +1,17 @@ -import pytest -from agbenchmark.Challenge import Challenge -from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os +from typing import Any, Dict + +import pytest + +from agbenchmark.challenge import Challenge +from agbenchmark.tests.basic_abilities.basic_challenge import BasicChallenge class TestReadFile(BasicChallenge): """Testing if LLM can read a file""" @pytest.fixture(scope="module", autouse=True) - def setup_module(self, workspace): + def setup_module(self, workspace: str) -> None: Challenge.write_to_file( workspace, self.data.ground.files[0], "this is how we're doing" ) @@ -17,7 +20,7 @@ class TestReadFile(BasicChallenge): return os.path.join(os.path.dirname(__file__), "r_file_data.json") @pytest.mark.depends(on=["basic_write_file"], name="basic_read_file") - def test_method(self, config): + def 
test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) files_contents = self.open_files(config["workspace"], self.data.ground.files) diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 8d3eb5404..05db09657 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -1,8 +1,8 @@ +import os from pathlib import Path +from typing import Any, Dict -import pytest -from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge -import os +from agbenchmark.tests.basic_abilities.basic_challenge import BasicChallenge class TestWriteFile(BasicChallenge): @@ -11,10 +11,10 @@ class TestWriteFile(BasicChallenge): def get_file_path(self) -> str: # all tests must implement this method return os.path.join(os.path.dirname(__file__), "w_file_data.json") - def test_method(self, config): + def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - workspace = Path(os.getcwd()) / config['workspace'] + workspace = Path(os.getcwd()) / config["workspace"] files_contents = self.open_files(workspace, self.data.ground.files) scores = [] diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 000000000..315ecae56 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,5 @@ +[mypy] +follow_imports = skip +check_untyped_defs = True +disallow_untyped_defs = True +exclude = ^(agent/.*\.py)$ diff --git a/poetry.lock b/poetry.lock index a460f988d..e05fc6c04 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,9 +1,10 @@ -# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. +# This file is automatically @generated by Poetry and should not be changed by hand. 
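Stepping back from the lock file for a moment: the refactored write_file and read_file tests above settle on a repeatable shape for ability tests, so implement `get_file_path`, call `setup_challenge`, resolve the workspace relative to the current directory, then score the ground files. A hedged sketch of a new test following that shape is below; the class name, the `a_file_data.json` challenge file, and the final assertion are illustrative rather than taken from the repository.

```python
# Illustrative sketch only: a hypothetical challenge test written against
# the pattern used by TestWriteFile and TestReadFile above.
import os
from pathlib import Path
from typing import Any, Dict

from agbenchmark.tests.basic_abilities.basic_challenge import BasicChallenge


class TestAppendFile(BasicChallenge):
    """Hypothetical challenge: did the agent append the expected text?"""

    def get_file_path(self) -> str:  # all tests must implement this method
        # a_file_data.json is an assumed challenge-definition file.
        return os.path.join(os.path.dirname(__file__), "a_file_data.json")

    def test_method(self, config: Dict[str, Any]) -> None:
        # Run the agent (or its mock) against the task from the JSON data.
        self.setup_challenge(config)

        # Resolve the workspace the same way write_file_test.py does.
        workspace = Path(os.getcwd()) / config["workspace"]
        files_contents = self.open_files(str(workspace), self.data.ground.files)

        # Score each matching file against the should_contain /
        # should_not_contain ground rules; 1.0 means a pass.
        scores = [
            self.scoring(content, self.data.ground) for content in files_contents
        ]
        assert 1.0 in scores
```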
[[package]] name = "aiohttp" version = "3.8.4" description = "Async http client/server framework (asyncio)" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -112,6 +113,7 @@ speedups = ["Brotli", "aiodns", "cchardet"] name = "aiosignal" version = "1.3.1" description = "aiosignal: a list of registered asynchronous callbacks" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -126,6 +128,7 @@ frozenlist = ">=1.1.0" name = "async-timeout" version = "4.0.2" description = "Timeout context manager for asyncio programs" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -137,6 +140,7 @@ files = [ name = "attrs" version = "23.1.0" description = "Classes Without Boilerplate" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -151,10 +155,74 @@ docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib- tests = ["attrs[tests-no-zope]", "zope-interface"] tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +[[package]] +name = "autoflake" +version = "1.7.8" +description = "Removes unused imports and unused variables" +category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "autoflake-1.7.8-py3-none-any.whl", hash = "sha256:46373ef69b6714f5064c923bb28bd797c4f8a9497f557d87fc36665c6d956b39"}, + {file = "autoflake-1.7.8.tar.gz", hash = "sha256:e7e46372dee46fa1c97acf310d99d922b63d369718a270809d7c278d34a194cf"}, +] + +[package.dependencies] +pyflakes = ">=1.1.0,<3" +tomli = {version = ">=2.0.1", markers = "python_version < \"3.11\""} + +[[package]] +name = "black" +version = "22.3.0" +description = "The uncompromising code formatter." +category = "dev" +optional = false +python-versions = ">=3.6.2" +files = [ + {file = "black-22.3.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:2497f9c2386572e28921fa8bec7be3e51de6801f7459dffd6e62492531c47e09"}, + {file = "black-22.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5795a0375eb87bfe902e80e0c8cfaedf8af4d49694d69161e5bd3206c18618bb"}, + {file = "black-22.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e3556168e2e5c49629f7b0f377070240bd5511e45e25a4497bb0073d9dda776a"}, + {file = "black-22.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67c8301ec94e3bcc8906740fe071391bce40a862b7be0b86fb5382beefecd968"}, + {file = "black-22.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:fd57160949179ec517d32ac2ac898b5f20d68ed1a9c977346efbac9c2f1e779d"}, + {file = "black-22.3.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:cc1e1de68c8e5444e8f94c3670bb48a2beef0e91dddfd4fcc29595ebd90bb9ce"}, + {file = "black-22.3.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d2fc92002d44746d3e7db7cf9313cf4452f43e9ea77a2c939defce3b10b5c82"}, + {file = "black-22.3.0-cp36-cp36m-win_amd64.whl", hash = "sha256:a6342964b43a99dbc72f72812bf88cad8f0217ae9acb47c0d4f141a6416d2d7b"}, + {file = "black-22.3.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:328efc0cc70ccb23429d6be184a15ce613f676bdfc85e5fe8ea2a9354b4e9015"}, + {file = "black-22.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:06f9d8846f2340dfac80ceb20200ea5d1b3f181dd0556b47af4e8e0b24fa0a6b"}, + {file = "black-22.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:ad4efa5fad66b903b4a5f96d91461d90b9507a812b3c5de657d544215bb7877a"}, + {file = "black-22.3.0-cp38-cp38-macosx_10_9_universal2.whl", hash = 
"sha256:e8477ec6bbfe0312c128e74644ac8a02ca06bcdb8982d4ee06f209be28cdf163"}, + {file = "black-22.3.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:637a4014c63fbf42a692d22b55d8ad6968a946b4a6ebc385c5505d9625b6a464"}, + {file = "black-22.3.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:863714200ada56cbc366dc9ae5291ceb936573155f8bf8e9de92aef51f3ad0f0"}, + {file = "black-22.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10dbe6e6d2988049b4655b2b739f98785a884d4d6b85bc35133a8fb9a2233176"}, + {file = "black-22.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:cee3e11161dde1b2a33a904b850b0899e0424cc331b7295f2a9698e79f9a69a0"}, + {file = "black-22.3.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5891ef8abc06576985de8fa88e95ab70641de6c1fca97e2a15820a9b69e51b20"}, + {file = "black-22.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:30d78ba6bf080eeaf0b7b875d924b15cd46fec5fd044ddfbad38c8ea9171043a"}, + {file = "black-22.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ee8f1f7228cce7dffc2b464f07ce769f478968bfb3dd1254a4c2eeed84928aad"}, + {file = "black-22.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ee227b696ca60dd1c507be80a6bc849a5a6ab57ac7352aad1ffec9e8b805f21"}, + {file = "black-22.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:9b542ced1ec0ceeff5b37d69838106a6348e60db7b8fdd245294dc1d26136265"}, + {file = "black-22.3.0-py3-none-any.whl", hash = "sha256:bc58025940a896d7e5356952228b68f793cf5fcb342be703c3a2669a1488cb72"}, + {file = "black-22.3.0.tar.gz", hash = "sha256:35020b8886c022ced9282b51b5a875b6d1ab0c387b31a065b84db7c33085ca79"}, +] + +[package.dependencies] +click = ">=8.0.0" +mypy-extensions = ">=0.4.3" +pathspec = ">=0.9.0" +platformdirs = ">=2" +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} +typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""} + +[package.extras] +colorama = ["colorama (>=0.4.3)"] +d = ["aiohttp (>=3.7.4)"] +jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] +uvloop = ["uvloop (>=0.15.2)"] + [[package]] name = "certifi" version = "2023.5.7" description = "Python package for providing Mozilla's CA Bundle." +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -166,6 +234,7 @@ files = [ name = "charset-normalizer" version = "3.1.0" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." +category = "main" optional = false python-versions = ">=3.7.0" files = [ @@ -250,6 +319,7 @@ files = [ name = "click" version = "8.1.3" description = "Composable command line interface toolkit" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -264,6 +334,7 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} name = "colorama" version = "0.4.6" description = "Cross-platform colored terminal text." 
+category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" files = [ @@ -275,6 +346,7 @@ files = [ name = "exceptiongroup" version = "1.1.1" description = "Backport of PEP 654 (exception groups)" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -285,10 +357,28 @@ files = [ [package.extras] test = ["pytest (>=6)"] +[[package]] +name = "flake8" +version = "3.9.2" +description = "the modular source code checker: pep8 pyflakes and co" +category = "dev" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" +files = [ + {file = "flake8-3.9.2-py2.py3-none-any.whl", hash = "sha256:bf8fd333346d844f616e8d47905ef3a3384edae6b4e9beb0c5101e25e3110907"}, + {file = "flake8-3.9.2.tar.gz", hash = "sha256:07528381786f2a6237b061f6e96610a4167b226cb926e2aa2b6b1d78057c576b"}, +] + +[package.dependencies] +mccabe = ">=0.6.0,<0.7.0" +pycodestyle = ">=2.7.0,<2.8.0" +pyflakes = ">=2.3.0,<2.4.0" + [[package]] name = "frozenlist" version = "1.3.3" description = "A list-like structure which implements collections.abc.MutableSequence" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -372,6 +462,7 @@ files = [ name = "future-fstrings" version = "1.2.0" description = "A backport of fstrings to python<3.6" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -386,6 +477,7 @@ rewrite = ["tokenize-rt (>=3)"] name = "idna" version = "3.4" description = "Internationalized Domain Names in Applications (IDNA)" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -397,6 +489,7 @@ files = [ name = "iniconfig" version = "2.0.0" description = "brain-dead simple config-ini parsing" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -404,10 +497,41 @@ files = [ {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, ] +[[package]] +name = "isort" +version = "5.12.0" +description = "A Python utility / library to sort Python imports." 
+category = "dev" +optional = false +python-versions = ">=3.8.0" +files = [ + {file = "isort-5.12.0-py3-none-any.whl", hash = "sha256:f84c2818376e66cf843d497486ea8fed8700b340f308f076c6fb1229dff318b6"}, + {file = "isort-5.12.0.tar.gz", hash = "sha256:8bef7dde241278824a6d83f44a544709b065191b95b6e50894bdc722fcba0504"}, +] + +[package.extras] +colors = ["colorama (>=0.4.3)"] +pipfile-deprecated-finder = ["pip-shims (>=0.5.2)", "pipreqs", "requirementslib"] +plugins = ["setuptools"] +requirements-deprecated-finder = ["pip-api", "pipreqs"] + +[[package]] +name = "mccabe" +version = "0.6.1" +description = "McCabe checker, plugin for flake8" +category = "dev" +optional = false +python-versions = "*" +files = [ + {file = "mccabe-0.6.1-py2.py3-none-any.whl", hash = "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42"}, + {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"}, +] + [[package]] name = "multidict" version = "6.0.4" description = "multidict implementation" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -487,10 +611,64 @@ files = [ {file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"}, ] +[[package]] +name = "mypy" +version = "0.910" +description = "Optional static typing for Python" +category = "dev" +optional = false +python-versions = ">=3.5" +files = [ + {file = "mypy-0.910-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:a155d80ea6cee511a3694b108c4494a39f42de11ee4e61e72bc424c490e46457"}, + {file = "mypy-0.910-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:b94e4b785e304a04ea0828759172a15add27088520dc7e49ceade7834275bedb"}, + {file = "mypy-0.910-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:088cd9c7904b4ad80bec811053272986611b84221835e079be5bcad029e79dd9"}, + {file = "mypy-0.910-cp35-cp35m-win_amd64.whl", hash = "sha256:adaeee09bfde366d2c13fe6093a7df5df83c9a2ba98638c7d76b010694db760e"}, + {file = "mypy-0.910-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:ecd2c3fe726758037234c93df7e98deb257fd15c24c9180dacf1ef829da5f921"}, + {file = "mypy-0.910-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:d9dd839eb0dc1bbe866a288ba3c1afc33a202015d2ad83b31e875b5905a079b6"}, + {file = "mypy-0.910-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:3e382b29f8e0ccf19a2df2b29a167591245df90c0b5a2542249873b5c1d78212"}, + {file = "mypy-0.910-cp36-cp36m-win_amd64.whl", hash = "sha256:53fd2eb27a8ee2892614370896956af2ff61254c275aaee4c230ae771cadd885"}, + {file = "mypy-0.910-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b6fb13123aeef4a3abbcfd7e71773ff3ff1526a7d3dc538f3929a49b42be03f0"}, + {file = "mypy-0.910-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:e4dab234478e3bd3ce83bac4193b2ecd9cf94e720ddd95ce69840273bf44f6de"}, + {file = "mypy-0.910-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:7df1ead20c81371ccd6091fa3e2878559b5c4d4caadaf1a484cf88d93ca06703"}, + {file = "mypy-0.910-cp37-cp37m-win_amd64.whl", hash = "sha256:0aadfb2d3935988ec3815952e44058a3100499f5be5b28c34ac9d79f002a4a9a"}, + {file = "mypy-0.910-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ec4e0cd079db280b6bdabdc807047ff3e199f334050db5cbb91ba3e959a67504"}, + {file = "mypy-0.910-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:119bed3832d961f3a880787bf621634ba042cb8dc850a7429f643508eeac97b9"}, + {file = "mypy-0.910-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:866c41f28cee548475f146aa4d39a51cf3b6a84246969f3759cb3e9c742fc072"}, + {file = 
"mypy-0.910-cp38-cp38-win_amd64.whl", hash = "sha256:ceb6e0a6e27fb364fb3853389607cf7eb3a126ad335790fa1e14ed02fba50811"}, + {file = "mypy-0.910-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1a85e280d4d217150ce8cb1a6dddffd14e753a4e0c3cf90baabb32cefa41b59e"}, + {file = "mypy-0.910-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:42c266ced41b65ed40a282c575705325fa7991af370036d3f134518336636f5b"}, + {file = "mypy-0.910-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:3c4b8ca36877fc75339253721f69603a9c7fdb5d4d5a95a1a1b899d8b86a4de2"}, + {file = "mypy-0.910-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:c0df2d30ed496a08de5daed2a9ea807d07c21ae0ab23acf541ab88c24b26ab97"}, + {file = "mypy-0.910-cp39-cp39-win_amd64.whl", hash = "sha256:c6c2602dffb74867498f86e6129fd52a2770c48b7cd3ece77ada4fa38f94eba8"}, + {file = "mypy-0.910-py3-none-any.whl", hash = "sha256:ef565033fa5a958e62796867b1df10c40263ea9ded87164d67572834e57a174d"}, + {file = "mypy-0.910.tar.gz", hash = "sha256:704098302473cb31a218f1775a873b376b30b4c18229421e9e9dc8916fd16150"}, +] + +[package.dependencies] +mypy-extensions = ">=0.4.3,<0.5.0" +toml = "*" +typing-extensions = ">=3.7.4" + +[package.extras] +dmypy = ["psutil (>=4.0)"] +python2 = ["typed-ast (>=1.4.0,<1.5.0)"] + +[[package]] +name = "mypy-extensions" +version = "0.4.4" +description = "Experimental type system extensions for programs checked with the mypy typechecker." +category = "dev" +optional = false +python-versions = ">=2.7" +files = [ + {file = "mypy_extensions-0.4.4.tar.gz", hash = "sha256:c8b707883a96efe9b4bb3aaf0dcc07e7e217d7d8368eec4db4049ee9e142f4fd"}, +] + [[package]] name = "networkx" version = "3.1" description = "Python package for creating and manipulating graphs and networks" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -509,6 +687,7 @@ test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] name = "openai" version = "0.27.8" description = "Python client library for the OpenAI API" +category = "main" optional = false python-versions = ">=3.7.1" files = [ @@ -523,7 +702,7 @@ tqdm = "*" [package.extras] datalib = ["numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] -dev = ["black (>=21.6b0,<22.0)", "pytest (==6.*)", "pytest-asyncio", "pytest-mock"] +dev = ["black (>=21.6b0,<22.0)", "pytest (>=6.0.0,<7.0.0)", "pytest-asyncio", "pytest-mock"] embeddings = ["matplotlib", "numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)", "plotly", "scikit-learn (>=1.0.2)", "scipy", "tenacity (>=8.0.1)"] wandb = ["numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)", "wandb"] @@ -531,6 +710,7 @@ wandb = ["numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1 name = "packaging" version = "23.1" description = "Core utilities for Python packages" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -539,114 +719,105 @@ files = [ ] [[package]] -name = "pexpect" -version = "4.8.0" -description = "Pexpect allows easy control of interactive console applications." +name = "pathspec" +version = "0.11.1" +description = "Utility library for gitignore style pattern matching of file paths." 
+category = "dev" optional = false -python-versions = "*" +python-versions = ">=3.7" files = [ - {file = "pexpect-4.8.0-py2.py3-none-any.whl", hash = "sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937"}, - {file = "pexpect-4.8.0.tar.gz", hash = "sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c"}, + {file = "pathspec-0.11.1-py3-none-any.whl", hash = "sha256:d8af70af76652554bd134c22b3e8a1cc46ed7d91edcdd721ef1a0c51a84a5293"}, + {file = "pathspec-0.11.1.tar.gz", hash = "sha256:2798de800fa92780e33acca925945e9a19a133b715067cf165b8866c15a31687"}, ] -[package.dependencies] -ptyprocess = ">=0.5" - [[package]] -name = "pluggy" -version = "1.0.0" -description = "plugin and hook calling mechanisms for python" +name = "platformdirs" +version = "3.8.0" +description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." +category = "dev" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, - {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, + {file = "platformdirs-3.8.0-py3-none-any.whl", hash = "sha256:ca9ed98ce73076ba72e092b23d3c93ea6c4e186b3f1c3dad6edd98ff6ffcca2e"}, + {file = "platformdirs-3.8.0.tar.gz", hash = "sha256:b0cabcb11063d21a0b261d557acb0a9d2126350e63b70cdf7db6347baea456dc"}, ] [package.extras] -dev = ["pre-commit", "tox"] -testing = ["pytest", "pytest-benchmark"] +docs = ["furo (>=2023.5.20)", "proselint (>=0.13)", "sphinx (>=7.0.1)", "sphinx-autodoc-typehints (>=1.23,!=1.23.4)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest-cov (>=4.1)", "pytest-mock (>=3.10)"] [[package]] -name = "psutil" -version = "5.9.5" -description = "Cross-platform lib for process and system monitoring in Python." 
+name = "pluggy" +version = "1.2.0" +description = "plugin and hook calling mechanisms for python" +category = "main" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +python-versions = ">=3.7" files = [ - {file = "psutil-5.9.5-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:be8929ce4313f9f8146caad4272f6abb8bf99fc6cf59344a3167ecd74f4f203f"}, - {file = "psutil-5.9.5-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:ab8ed1a1d77c95453db1ae00a3f9c50227ebd955437bcf2a574ba8adbf6a74d5"}, - {file = "psutil-5.9.5-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:4aef137f3345082a3d3232187aeb4ac4ef959ba3d7c10c33dd73763fbc063da4"}, - {file = "psutil-5.9.5-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:ea8518d152174e1249c4f2a1c89e3e6065941df2fa13a1ab45327716a23c2b48"}, - {file = "psutil-5.9.5-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:acf2aef9391710afded549ff602b5887d7a2349831ae4c26be7c807c0a39fac4"}, - {file = "psutil-5.9.5-cp27-none-win32.whl", hash = "sha256:5b9b8cb93f507e8dbaf22af6a2fd0ccbe8244bf30b1baad6b3954e935157ae3f"}, - {file = "psutil-5.9.5-cp27-none-win_amd64.whl", hash = "sha256:8c5f7c5a052d1d567db4ddd231a9d27a74e8e4a9c3f44b1032762bd7b9fdcd42"}, - {file = "psutil-5.9.5-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:3c6f686f4225553615612f6d9bc21f1c0e305f75d7d8454f9b46e901778e7217"}, - {file = "psutil-5.9.5-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7a7dd9997128a0d928ed4fb2c2d57e5102bb6089027939f3b722f3a210f9a8da"}, - {file = "psutil-5.9.5-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89518112647f1276b03ca97b65cc7f64ca587b1eb0278383017c2a0dcc26cbe4"}, - {file = "psutil-5.9.5-cp36-abi3-win32.whl", hash = "sha256:104a5cc0e31baa2bcf67900be36acde157756b9c44017b86b2c049f11957887d"}, - {file = "psutil-5.9.5-cp36-abi3-win_amd64.whl", hash = "sha256:b258c0c1c9d145a1d5ceffab1134441c4c5113b2417fafff7315a917a026c3c9"}, - {file = "psutil-5.9.5-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:c607bb3b57dc779d55e1554846352b4e358c10fff3abf3514a7a6601beebdb30"}, - {file = "psutil-5.9.5.tar.gz", hash = "sha256:5410638e4df39c54d957fc51ce03048acd8e6d60abc0f5107af51e5fb566eb3c"}, + {file = "pluggy-1.2.0-py3-none-any.whl", hash = "sha256:c2fd55a7d7a3863cba1a013e4e2414658b1d07b6bc57b3919e0c63c9abb99849"}, + {file = "pluggy-1.2.0.tar.gz", hash = "sha256:d12f0c4b579b15f5e054301bb226ee85eeeba08ffec228092f8defbaa3a4c4b3"}, ] [package.extras] -test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] [[package]] -name = "ptyprocess" -version = "0.7.0" -description = "Run a subprocess in a pseudo terminal" +name = "pycodestyle" +version = "2.7.0" +description = "Python style guide checker" +category = "dev" optional = false -python-versions = "*" +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ - {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, - {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"}, + {file = "pycodestyle-2.7.0-py2.py3-none-any.whl", hash = "sha256:514f76d918fcc0b55c6680472f0a37970994e07bbb80725808c17089be302068"}, + {file = "pycodestyle-2.7.0.tar.gz", hash = "sha256:c389c1d06bf7904078ca03399a4816f974a1d590090fecea0c63ec26ebaf1cef"}, ] [[package]] name = "pydantic" -version = 
"1.10.9" +version = "1.10.10" description = "Data validation and settings management using python type hints" +category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "pydantic-1.10.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e692dec4a40bfb40ca530e07805b1208c1de071a18d26af4a2a0d79015b352ca"}, - {file = "pydantic-1.10.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3c52eb595db83e189419bf337b59154bdcca642ee4b2a09e5d7797e41ace783f"}, - {file = "pydantic-1.10.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:939328fd539b8d0edf244327398a667b6b140afd3bf7e347cf9813c736211896"}, - {file = "pydantic-1.10.9-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b48d3d634bca23b172f47f2335c617d3fcb4b3ba18481c96b7943a4c634f5c8d"}, - {file = "pydantic-1.10.9-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:f0b7628fb8efe60fe66fd4adadd7ad2304014770cdc1f4934db41fe46cc8825f"}, - {file = "pydantic-1.10.9-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:e1aa5c2410769ca28aa9a7841b80d9d9a1c5f223928ca8bec7e7c9a34d26b1d4"}, - {file = "pydantic-1.10.9-cp310-cp310-win_amd64.whl", hash = "sha256:eec39224b2b2e861259d6f3c8b6290d4e0fbdce147adb797484a42278a1a486f"}, - {file = "pydantic-1.10.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d111a21bbbfd85c17248130deac02bbd9b5e20b303338e0dbe0faa78330e37e0"}, - {file = "pydantic-1.10.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e9aec8627a1a6823fc62fb96480abe3eb10168fd0d859ee3d3b395105ae19a7"}, - {file = "pydantic-1.10.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:07293ab08e7b4d3c9d7de4949a0ea571f11e4557d19ea24dd3ae0c524c0c334d"}, - {file = "pydantic-1.10.9-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7ee829b86ce984261d99ff2fd6e88f2230068d96c2a582f29583ed602ef3fc2c"}, - {file = "pydantic-1.10.9-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4b466a23009ff5cdd7076eb56aca537c745ca491293cc38e72bf1e0e00de5b91"}, - {file = "pydantic-1.10.9-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7847ca62e581e6088d9000f3c497267868ca2fa89432714e21a4fb33a04d52e8"}, - {file = "pydantic-1.10.9-cp311-cp311-win_amd64.whl", hash = "sha256:7845b31959468bc5b78d7b95ec52fe5be32b55d0d09983a877cca6aedc51068f"}, - {file = "pydantic-1.10.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:517a681919bf880ce1dac7e5bc0c3af1e58ba118fd774da2ffcd93c5f96eaece"}, - {file = "pydantic-1.10.9-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67195274fd27780f15c4c372f4ba9a5c02dad6d50647b917b6a92bf00b3d301a"}, - {file = "pydantic-1.10.9-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2196c06484da2b3fded1ab6dbe182bdabeb09f6318b7fdc412609ee2b564c49a"}, - {file = "pydantic-1.10.9-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:6257bb45ad78abacda13f15bde5886efd6bf549dd71085e64b8dcf9919c38b60"}, - {file = "pydantic-1.10.9-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:3283b574b01e8dbc982080d8287c968489d25329a463b29a90d4157de4f2baaf"}, - {file = "pydantic-1.10.9-cp37-cp37m-win_amd64.whl", hash = "sha256:5f8bbaf4013b9a50e8100333cc4e3fa2f81214033e05ac5aa44fa24a98670a29"}, - {file = "pydantic-1.10.9-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b9cd67fb763248cbe38f0593cd8611bfe4b8ad82acb3bdf2b0898c23415a1f82"}, - {file = "pydantic-1.10.9-cp38-cp38-macosx_11_0_arm64.whl", hash = 
"sha256:f50e1764ce9353be67267e7fd0da08349397c7db17a562ad036aa7c8f4adfdb6"}, - {file = "pydantic-1.10.9-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:73ef93e5e1d3c8e83f1ff2e7fdd026d9e063c7e089394869a6e2985696693766"}, - {file = "pydantic-1.10.9-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:128d9453d92e6e81e881dd7e2484e08d8b164da5507f62d06ceecf84bf2e21d3"}, - {file = "pydantic-1.10.9-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ad428e92ab68798d9326bb3e5515bc927444a3d71a93b4a2ca02a8a5d795c572"}, - {file = "pydantic-1.10.9-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:fab81a92f42d6d525dd47ced310b0c3e10c416bbfae5d59523e63ea22f82b31e"}, - {file = "pydantic-1.10.9-cp38-cp38-win_amd64.whl", hash = "sha256:963671eda0b6ba6926d8fc759e3e10335e1dc1b71ff2a43ed2efd6996634dafb"}, - {file = "pydantic-1.10.9-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:970b1bdc6243ef663ba5c7e36ac9ab1f2bfecb8ad297c9824b542d41a750b298"}, - {file = "pydantic-1.10.9-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7e1d5290044f620f80cf1c969c542a5468f3656de47b41aa78100c5baa2b8276"}, - {file = "pydantic-1.10.9-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:83fcff3c7df7adff880622a98022626f4f6dbce6639a88a15a3ce0f96466cb60"}, - {file = "pydantic-1.10.9-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0da48717dc9495d3a8f215e0d012599db6b8092db02acac5e0d58a65248ec5bc"}, - {file = "pydantic-1.10.9-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:0a2aabdc73c2a5960e87c3ffebca6ccde88665616d1fd6d3db3178ef427b267a"}, - {file = "pydantic-1.10.9-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9863b9420d99dfa9c064042304868e8ba08e89081428a1c471858aa2af6f57c4"}, - {file = "pydantic-1.10.9-cp39-cp39-win_amd64.whl", hash = "sha256:e7c9900b43ac14110efa977be3da28931ffc74c27e96ee89fbcaaf0b0fe338e1"}, - {file = "pydantic-1.10.9-py3-none-any.whl", hash = "sha256:6cafde02f6699ce4ff643417d1a9223716ec25e228ddc3b436fe7e2d25a1f305"}, - {file = "pydantic-1.10.9.tar.gz", hash = "sha256:95c70da2cd3b6ddf3b9645ecaa8d98f3d80c606624b6d245558d202cd23ea3be"}, + {file = "pydantic-1.10.10-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:adad1ee4ab9888f12dac2529276704e719efcf472e38df7813f5284db699b4ec"}, + {file = "pydantic-1.10.10-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7a7db03339893feef2092ff7b1afc9497beed15ebd4af84c3042a74abce02d48"}, + {file = "pydantic-1.10.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67b3714b97ff84b2689654851c2426389bcabfac9080617bcf4306c69db606f6"}, + {file = "pydantic-1.10.10-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:edfdf0a5abc5c9bf2052ebaec20e67abd52e92d257e4f2d30e02c354ed3e6030"}, + {file = "pydantic-1.10.10-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:20a3b30fd255eeeb63caa9483502ba96b7795ce5bf895c6a179b3d909d9f53a6"}, + {file = "pydantic-1.10.10-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:db4c7f7e60ca6f7d6c1785070f3e5771fcb9b2d88546e334d2f2c3934d949028"}, + {file = "pydantic-1.10.10-cp310-cp310-win_amd64.whl", hash = "sha256:a2d5be50ac4a0976817144c7d653e34df2f9436d15555189f5b6f61161d64183"}, + {file = "pydantic-1.10.10-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:566a04ba755e8f701b074ffb134ddb4d429f75d5dced3fbd829a527aafe74c71"}, + {file = "pydantic-1.10.10-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:f79db3652ed743309f116ba863dae0c974a41b688242482638b892246b7db21d"}, + {file = "pydantic-1.10.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c62376890b819bebe3c717a9ac841a532988372b7e600e76f75c9f7c128219d5"}, + {file = "pydantic-1.10.10-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4870f13a4fafd5bc3e93cff3169222534fad867918b188e83ee0496452978437"}, + {file = "pydantic-1.10.10-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:990027e77cda6072a566e433b6962ca3b96b4f3ae8bd54748e9d62a58284d9d7"}, + {file = "pydantic-1.10.10-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:8c40964596809eb616d94f9c7944511f620a1103d63d5510440ed2908fc410af"}, + {file = "pydantic-1.10.10-cp311-cp311-win_amd64.whl", hash = "sha256:ea9eebc2ebcba3717e77cdeee3f6203ffc0e78db5f7482c68b1293e8cc156e5e"}, + {file = "pydantic-1.10.10-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:762aa598f79b4cac2f275d13336b2dd8662febee2a9c450a49a2ab3bec4b385f"}, + {file = "pydantic-1.10.10-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6dab5219659f95e357d98d70577b361383057fb4414cfdb587014a5f5c595f7b"}, + {file = "pydantic-1.10.10-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3d4ee957a727ccb5a36f1b0a6dbd9fad5dedd2a41eada99a8df55c12896e18d"}, + {file = "pydantic-1.10.10-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:b69f9138dec566962ec65623c9d57bee44412d2fc71065a5f3ebb3820bdeee96"}, + {file = "pydantic-1.10.10-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:7aa75d1bd9cc275cf9782f50f60cddaf74cbaae19b6ada2a28e737edac420312"}, + {file = "pydantic-1.10.10-cp37-cp37m-win_amd64.whl", hash = "sha256:9f62a727f5c590c78c2d12fda302d1895141b767c6488fe623098f8792255fe5"}, + {file = "pydantic-1.10.10-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:aac218feb4af73db8417ca7518fb3bade4534fcca6e3fb00f84966811dd94450"}, + {file = "pydantic-1.10.10-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:88546dc10a40b5b52cae87d64666787aeb2878f9a9b37825aedc2f362e7ae1da"}, + {file = "pydantic-1.10.10-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c41bbaae89e32fc582448e71974de738c055aef5ab474fb25692981a08df808a"}, + {file = "pydantic-1.10.10-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b71bd504d1573b0b722ae536e8ffb796bedeef978979d076bf206e77dcc55a5"}, + {file = "pydantic-1.10.10-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:e088e3865a2270ecbc369924cd7d9fbc565667d9158e7f304e4097ebb9cf98dd"}, + {file = "pydantic-1.10.10-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:3403a090db45d4027d2344859d86eb797484dfda0706cf87af79ace6a35274ef"}, + {file = "pydantic-1.10.10-cp38-cp38-win_amd64.whl", hash = "sha256:e0014e29637125f4997c174dd6167407162d7af0da73414a9340461ea8573252"}, + {file = "pydantic-1.10.10-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9965e49c6905840e526e5429b09e4c154355b6ecc0a2f05492eda2928190311d"}, + {file = "pydantic-1.10.10-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:748d10ab6089c5d196e1c8be9de48274f71457b01e59736f7a09c9dc34f51887"}, + {file = "pydantic-1.10.10-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86936c383f7c38fd26d35107eb669c85d8f46dfceae873264d9bab46fe1c7dde"}, + {file = "pydantic-1.10.10-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:7a26841be620309a9697f5b1ffc47dce74909e350c5315ccdac7a853484d468a"}, + {file = "pydantic-1.10.10-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:409b810f387610cc7405ab2fa6f62bdf7ea485311845a242ebc0bd0496e7e5ac"}, + {file = "pydantic-1.10.10-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ce937a2a2c020bcad1c9fde02892392a1123de6dda906ddba62bfe8f3e5989a2"}, + {file = "pydantic-1.10.10-cp39-cp39-win_amd64.whl", hash = "sha256:37ebddef68370e6f26243acc94de56d291e01227a67b2ace26ea3543cf53dd5f"}, + {file = "pydantic-1.10.10-py3-none-any.whl", hash = "sha256:a5939ec826f7faec434e2d406ff5e4eaf1716eb1f247d68cd3d0b3612f7b4c8a"}, + {file = "pydantic-1.10.10.tar.gz", hash = "sha256:3b8d5bd97886f9eb59260594207c9f57dce14a6f869c6ceea90188715d29921a"}, ] [package.dependencies] @@ -656,15 +827,28 @@ typing-extensions = ">=4.2.0" dotenv = ["python-dotenv (>=0.10.4)"] email = ["email-validator (>=1.0.3)"] +[[package]] +name = "pyflakes" +version = "2.3.1" +description = "passive checker of Python programs" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "pyflakes-2.3.1-py2.py3-none-any.whl", hash = "sha256:7893783d01b8a89811dd72d7dfd4d84ff098e5eed95cfa8905b22bbffe52efc3"}, + {file = "pyflakes-2.3.1.tar.gz", hash = "sha256:f5bc8ecabc05bb9d291eb5203d6810b49040f6ff446a756326104746cc00c1db"}, +] + [[package]] name = "pytest" -version = "7.3.2" +version = "7.4.0" description = "pytest: simple powerful testing with Python" +category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "pytest-7.3.2-py3-none-any.whl", hash = "sha256:cdcbd012c9312258922f8cd3f1b62a6580fdced17db6014896053d47cddf9295"}, - {file = "pytest-7.3.2.tar.gz", hash = "sha256:ee990a3cc55ba808b80795a79944756f315c67c12b56abd3ac993a7b8c17030b"}, + {file = "pytest-7.4.0-py3-none-any.whl", hash = "sha256:78bf16451a2eb8c7a2ea98e32dc119fd2aa758f1d5d66dbf0a59d69a3969df32"}, + {file = "pytest-7.4.0.tar.gz", hash = "sha256:b4bf8c45bd59934ed84001ad51e11b4ee40d40a1229d2c79f9c592b0a3f6bd8a"}, ] [package.dependencies] @@ -682,6 +866,7 @@ testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "no name = "pytest-depends" version = "1.0.1" description = "Tests that depend on other tests" +category = "main" optional = false python-versions = "*" files = [ @@ -699,6 +884,7 @@ pytest = ">=3" name = "python-dotenv" version = "1.0.0" description = "Read key-value pairs from a .env file and set them as environment variables" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -709,33 +895,11 @@ files = [ [package.extras] cli = ["click (>=5.0)"] -[[package]] -name = "pywin32" -version = "306" -description = "Python for Window Extensions" -optional = false -python-versions = "*" -files = [ - {file = "pywin32-306-cp310-cp310-win32.whl", hash = "sha256:06d3420a5155ba65f0b72f2699b5bacf3109f36acbe8923765c22938a69dfc8d"}, - {file = "pywin32-306-cp310-cp310-win_amd64.whl", hash = "sha256:84f4471dbca1887ea3803d8848a1616429ac94a4a8d05f4bc9c5dcfd42ca99c8"}, - {file = "pywin32-306-cp311-cp311-win32.whl", hash = "sha256:e65028133d15b64d2ed8f06dd9fbc268352478d4f9289e69c190ecd6818b6407"}, - {file = "pywin32-306-cp311-cp311-win_amd64.whl", hash = "sha256:a7639f51c184c0272e93f244eb24dafca9b1855707d94c192d4a0b4c01e1100e"}, - {file = "pywin32-306-cp311-cp311-win_arm64.whl", hash = "sha256:70dba0c913d19f942a2db25217d9a1b726c278f483a919f1abfed79c9cf64d3a"}, - {file = "pywin32-306-cp312-cp312-win32.whl", hash = 
"sha256:383229d515657f4e3ed1343da8be101000562bf514591ff383ae940cad65458b"}, - {file = "pywin32-306-cp312-cp312-win_amd64.whl", hash = "sha256:37257794c1ad39ee9be652da0462dc2e394c8159dfd913a8a4e8eb6fd346da0e"}, - {file = "pywin32-306-cp312-cp312-win_arm64.whl", hash = "sha256:5821ec52f6d321aa59e2db7e0a35b997de60c201943557d108af9d4ae1ec7040"}, - {file = "pywin32-306-cp37-cp37m-win32.whl", hash = "sha256:1c73ea9a0d2283d889001998059f5eaaba3b6238f767c9cf2833b13e6a685f65"}, - {file = "pywin32-306-cp37-cp37m-win_amd64.whl", hash = "sha256:72c5f621542d7bdd4fdb716227be0dd3f8565c11b280be6315b06ace35487d36"}, - {file = "pywin32-306-cp38-cp38-win32.whl", hash = "sha256:e4c092e2589b5cf0d365849e73e02c391c1349958c5ac3e9d5ccb9a28e017b3a"}, - {file = "pywin32-306-cp38-cp38-win_amd64.whl", hash = "sha256:e8ac1ae3601bee6ca9f7cb4b5363bf1c0badb935ef243c4733ff9a393b1690c0"}, - {file = "pywin32-306-cp39-cp39-win32.whl", hash = "sha256:e25fd5b485b55ac9c057f67d94bc203f3f6595078d1fb3b458c9c28b7153a802"}, - {file = "pywin32-306-cp39-cp39-win_amd64.whl", hash = "sha256:39b61c15272833b5c329a2989999dcae836b1eed650252ab1b7bfbe1d59f30f4"}, -] - [[package]] name = "requests" version = "2.31.0" description = "Python HTTP for Humans." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -753,10 +917,23 @@ urllib3 = ">=1.21.1,<3" socks = ["PySocks (>=1.5.6,!=1.5.7)"] use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] +[[package]] +name = "toml" +version = "0.10.2" +description = "Python Library for Tom's Obvious, Minimal Language" +category = "dev" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, + {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, +] + [[package]] name = "tomli" version = "2.0.1" description = "A lil' TOML parser" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -768,6 +945,7 @@ files = [ name = "tqdm" version = "4.65.0" description = "Fast, Extensible Progress Meter" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -786,19 +964,21 @@ telegram = ["requests"] [[package]] name = "typing-extensions" -version = "4.6.3" +version = "4.7.0" description = "Backported and Experimental Type Hints for Python 3.7+" +category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "typing_extensions-4.6.3-py3-none-any.whl", hash = "sha256:88a4153d8505aabbb4e13aacb7c486c2b4a33ca3b3f807914a9b4c844c471c26"}, - {file = "typing_extensions-4.6.3.tar.gz", hash = "sha256:d91d5919357fe7f681a9f2b5b4cb2a5f1ef0a1e9f59c4d8ff0d3491e05c0ffd5"}, + {file = "typing_extensions-4.7.0-py3-none-any.whl", hash = "sha256:5d8c9dac95c27d20df12fb1d97b9793ab8b2af8a3a525e68c80e21060c161771"}, + {file = "typing_extensions-4.7.0.tar.gz", hash = "sha256:935ccf31549830cda708b42289d44b6f74084d616a00be651601a4f968e77c82"}, ] [[package]] name = "urllib3" version = "2.0.3" description = "HTTP library with thread-safe connection pooling, file post, and more." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -812,27 +992,11 @@ secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17. 
socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] -[[package]] -name = "wexpect" -version = "4.0.0" -description = "Windows alternative of pexpect" -optional = false -python-versions = "*" -files = [ - {file = "wexpect-4.0.0.tar.gz", hash = "sha256:de9e739e78ec4d74a39bf8499904dacb6c594007a674fb7e10752c9b131f6522"}, -] - -[package.dependencies] -psutil = ">=5.0.0" -pywin32 = ">=220" - -[package.extras] -test = ["codecov", "coverage", "pyinstaller", "setuptools (>=38.0)", "tox", "twine"] - [[package]] name = "yarl" version = "1.9.2" description = "Yet another URL library" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -918,5 +1082,5 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" -python-versions = "^3.9" -content-hash = "8ab722acade739b9fb841ecae3b8cabd4f1d8a355864573a93d9faa11dcffb90" +python-versions = "^3.10" +content-hash = "7b5ef821765fd03ae347d42a62be71cb50e97b778544da90a06d35e1808f8ac3" diff --git a/pyproject.toml b/pyproject.toml index 043fe68a2..b458f44bd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,15 +8,21 @@ readme = "README.md" packages = [{include = "agbenchmark"}] [tool.poetry.dependencies] -python = "^3.9" +python = "^3.10" pytest = "^7.3.2" -click = "^8.1.3" requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" pytest-depends = "^1.0.1" python-dotenv = "^1.0.0" +click = "^8.1.3" +[tool.poetry.group.dev.dependencies] +flake8 = "^3.9.2" +mypy = "^0.910" +isort = "^5.9.3" +black = "22.3" +autoflake = "^1.4" [build-system] requires = ["poetry-core"] @@ -36,3 +42,21 @@ markers = [ [tool.poetry.scripts] agbenchmark = "agbenchmark.start_benchmark:cli" + +[tool.black] +line-length = 88 +target-version = ['py310'] +include = '\.pyi?$' +packages = ["autogpt"] +extend-exclude = '(/dist|/.venv|/venv|/build|/agent)/' + +[tool.isort] +profile = "black" +multi_line_output = 3 +include_trailing_comma = true +force_grid_wrap = 0 +use_parentheses = true +ensure_newline_before_comments = true +line_length = 88 +sections = ["FUTURE", "STDLIB", "THIRDPARTY", "FIRSTPARTY", "LOCALFOLDER"] +skip_glob = [".tox", "__pycache__", "*.pyc", "venv*/*", "reports", "venv", "env", "node_modules", ".env", ".venv", "dist", "agent/*"] -- cgit v1.2.3 From 07133fb04103776bf639dfb5380d1c7dbb36fb92 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Mon, 3 Jul 2023 11:42:24 -0700 Subject: Run regression tests on push to master and stable (#46) --- .github/workflows/autogpt.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/autogpt.yml b/.github/workflows/autogpt.yml index e889b4c35..4316f36ff 100644 --- a/.github/workflows/autogpt.yml +++ b/.github/workflows/autogpt.yml @@ -2,6 +2,9 @@ name: Auto-GPT Regression Test on: workflow_dispatch: + branches: [ master ] + push: + branches: [ stable, master, ci-test* ] jobs: regression-tests: -- cgit v1.2.3 From 101ffdbce03086b3ef5cd56ef46bff2e58f99783 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Mon, 3 Jul 2023 11:53:28 -0700 Subject: Integrate with gpt engineer (#47) --- .github/workflows/gpt-engineer.yml | 70 +++++++++++++++++++++++++++ .gitmodules | 4 ++ agbenchmark/challenges/retrieval/Retrieval.py | 8 --- agbenchmark/challenges/retrieval/retrieval.py | 8 +++ agbenchmark/config.json | 5 -- agbenchmark/start_benchmark.py | 3 +- agent/Auto-GPT | 2 +- agent/gpt-engineer | 1 + config.json | 5 ++ poetry.lock | 17 +++---- pyproject.toml | 2 +- 11 files changed, 99 insertions(+), 26 deletions(-) create mode 100644 .github/workflows/gpt-engineer.yml delete mode 100644 
agbenchmark/challenges/retrieval/Retrieval.py create mode 100644 agbenchmark/challenges/retrieval/retrieval.py delete mode 100644 agbenchmark/config.json create mode 160000 agent/gpt-engineer create mode 100644 config.json diff --git a/.github/workflows/gpt-engineer.yml b/.github/workflows/gpt-engineer.yml new file mode 100644 index 000000000..e0dbac2f0 --- /dev/null +++ b/.github/workflows/gpt-engineer.yml @@ -0,0 +1,70 @@ +name: gpt-engineer Regression Test + +on: + workflow_dispatch: + branches: [ master ] + push: + branches: [ stable, master, ci-test* ] + +jobs: + regression-tests: + permissions: + pull-requests: write + contents: write + runs-on: ubuntu-latest + timeout-minutes: 30 + strategy: + matrix: + python-version: ["3.10"] + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 0 + ref: ${{ github.event.pull_request.head.ref }} + repository: ${{ github.event.pull_request.head.repo.full_name }} + submodules: true + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - id: get_date + name: Get date + run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT + + - name: Install Poetry + run: | + curl -sSL https://install.python-poetry.org | python - + + - name: Set up Poetry cache + uses: actions/cache@v2 + with: + path: | + ~/.cache/pypoetry + .venv + key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} + + - name: Set up venv and install Python dependencies + run: | + poetry install --only main + poetry build + + - name: Run regression tests + run: | + cd agent/gpt-engineer + make install + source venv/bin/activate + pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl + agbenchmark start --reg + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + + - name: Upload logs as artifact + if: always() + uses: actions/upload-artifact@v3 + with: + name: gpt-engineer-projects + path: agent/gpt-engineer/projects diff --git a/.gitmodules b/.gitmodules index 2e3a86e5f..b5b7ba249 100644 --- a/.gitmodules +++ b/.gitmodules @@ -2,3 +2,7 @@ path = agent/Auto-GPT url = https://github.com/Significant-Gravitas/Auto-GPT.git branch = benchmark-integration +[submodule "agent/gpt-engineer"] + path = agent/gpt-engineer + url = https://github.com/merwanehamadi/gpt-engineer.git + branch = benchmark-integration diff --git a/agbenchmark/challenges/retrieval/Retrieval.py b/agbenchmark/challenges/retrieval/Retrieval.py deleted file mode 100644 index 891cccef7..000000000 --- a/agbenchmark/challenges/retrieval/Retrieval.py +++ /dev/null @@ -1,8 +0,0 @@ -import pytest - -from agbenchmark.challenge import Challenge - - -@pytest.mark.retrieval -class RetrievalChallenge(Challenge): - """Challenge for information-retrieval""" diff --git a/agbenchmark/challenges/retrieval/retrieval.py b/agbenchmark/challenges/retrieval/retrieval.py new file mode 100644 index 000000000..891cccef7 --- /dev/null +++ b/agbenchmark/challenges/retrieval/retrieval.py @@ -0,0 +1,8 @@ +import pytest + +from agbenchmark.challenge import Challenge + + +@pytest.mark.retrieval +class RetrievalChallenge(Challenge): + """Challenge for information-retrieval""" diff --git a/agbenchmark/config.json b/agbenchmark/config.json deleted file mode 100644 index e1c5f154b..000000000 --- a/agbenchmark/config.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "workspace": "autogpt/workspace/auto_gpt_workspace", - "func_path": "benchmarks.py", - "cutoff": 60 
-} diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index 13e1af231..7489aa309 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -12,9 +12,8 @@ load_dotenv() CURRENT_DIRECTORY = Path(__file__).resolve().parent -new_path = CURRENT_DIRECTORY / "config.json" -CONFIG_PATH = str(new_path.resolve()) +CONFIG_PATH = str(Path(os.getcwd()) / "config.json") REGRESSION_TESTS_PATH = str(Path(os.getcwd()) / "regression_tests.json") diff --git a/agent/Auto-GPT b/agent/Auto-GPT index c29ec925f..2e5eac51d 160000 --- a/agent/Auto-GPT +++ b/agent/Auto-GPT @@ -1 +1 @@ -Subproject commit c29ec925fd9e24f219ef0f2884b08908cd66239b +Subproject commit 2e5eac51d06d495919d720d370c4d9efd49f4784 diff --git a/agent/gpt-engineer b/agent/gpt-engineer new file mode 160000 index 000000000..f91ac66b8 --- /dev/null +++ b/agent/gpt-engineer @@ -0,0 +1 @@ +Subproject commit f91ac66b8e8210760aaa0047f2ca11c52e55aaa5 diff --git a/config.json b/config.json new file mode 100644 index 000000000..652618e4b --- /dev/null +++ b/config.json @@ -0,0 +1,5 @@ +{ + "workspace": "projects/my-new-project/workspace", + "func_path": "benchmarks.py", + "cutoff": 60 +} diff --git a/poetry.lock b/poetry.lock index e05fc6c04..4eae340b6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -210,7 +210,6 @@ mypy-extensions = ">=0.4.3" pathspec = ">=0.9.0" platformdirs = ">=2" tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} -typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""} [package.extras] colorama = ["colorama (>=0.4.3)"] @@ -882,14 +881,14 @@ pytest = ">=3" [[package]] name = "python-dotenv" -version = "1.0.0" +version = "0.21.1" description = "Read key-value pairs from a .env file and set them as environment variables" category = "main" optional = false -python-versions = ">=3.8" +python-versions = ">=3.7" files = [ - {file = "python-dotenv-1.0.0.tar.gz", hash = "sha256:a8df96034aae6d2d50a4ebe8216326c61c3eb64836776504fcca410e5937a3ba"}, - {file = "python_dotenv-1.0.0-py3-none-any.whl", hash = "sha256:f5971a9226b701070a4bf2c38c89e5a3f0d64de8debda981d1db98583009122a"}, + {file = "python-dotenv-0.21.1.tar.gz", hash = "sha256:1c93de8f636cde3ce377292818d0e440b6e45a82f215c3744979151fa8151c49"}, + {file = "python_dotenv-0.21.1-py3-none-any.whl", hash = "sha256:41e12e0318bebc859fcc4d97d4db8d20ad21721a6aa5047dd59f090391cb549a"}, ] [package.extras] @@ -964,14 +963,14 @@ telegram = ["requests"] [[package]] name = "typing-extensions" -version = "4.7.0" +version = "4.7.1" description = "Backported and Experimental Type Hints for Python 3.7+" category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "typing_extensions-4.7.0-py3-none-any.whl", hash = "sha256:5d8c9dac95c27d20df12fb1d97b9793ab8b2af8a3a525e68c80e21060c161771"}, - {file = "typing_extensions-4.7.0.tar.gz", hash = "sha256:935ccf31549830cda708b42289d44b6f74084d616a00be651601a4f968e77c82"}, + {file = "typing_extensions-4.7.1-py3-none-any.whl", hash = "sha256:440d5dd3af93b060174bf433bccd69b0babc3b15b1a8dca43789fd7f61514b36"}, + {file = "typing_extensions-4.7.1.tar.gz", hash = "sha256:b75ddc264f0ba5615db7ba217daeb99701ad295353c45f9e95963337ceeeffb2"}, ] [[package]] @@ -1083,4 +1082,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "7b5ef821765fd03ae347d42a62be71cb50e97b778544da90a06d35e1808f8ac3" +content-hash = "44b5789494e73f3cb8bcb9d25daa62143e59352a246fd7724fdb3ad58c2560ae" diff --git a/pyproject.toml b/pyproject.toml index 
b458f44bd..7e95969af 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" pytest-depends = "^1.0.1" -python-dotenv = "^1.0.0" +python-dotenv = "^0.21.0" click = "^8.1.3" [tool.poetry.group.dev.dependencies] -- cgit v1.2.3 From f183e91ccd1067f3381010687b578554183121b0 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Mon, 3 Jul 2023 17:28:29 -0700 Subject: Integrate smol developer with agbenchmark (#48) --- .github/workflows/smol-developer.yml | 64 ++++++++++++++++++++++++++++++++++++ .gitmodules | 4 +++ agent/smol-developer | 1 + 3 files changed, 69 insertions(+) create mode 100644 .github/workflows/smol-developer.yml create mode 160000 agent/smol-developer diff --git a/.github/workflows/smol-developer.yml b/.github/workflows/smol-developer.yml new file mode 100644 index 000000000..13ee8cf8d --- /dev/null +++ b/.github/workflows/smol-developer.yml @@ -0,0 +1,64 @@ +name: smol developer Regression Test + +on: + workflow_dispatch: + branches: [ master ] + push: + branches: [ stable, master, ci-test* ] + +jobs: + regression-tests: + permissions: + pull-requests: write + contents: write + runs-on: ubuntu-latest + timeout-minutes: 30 + strategy: + matrix: + python-version: ["3.10"] + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 0 + ref: ${{ github.event.pull_request.head.ref }} + repository: ${{ github.event.pull_request.head.repo.full_name }} + submodules: true + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - id: get_date + name: Get date + run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT + + - name: Install Poetry + run: | + curl -sSL https://install.python-poetry.org | python - + + - name: Set up Poetry cache + uses: actions/cache@v2 + with: + path: | + ~/.cache/pypoetry + .venv + key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} + + - name: Set up venv and install Python dependencies + run: | + poetry install --only main + poetry build + + - name: Run regression tests + run: | + cd agent/smol-developer + python -m venv venv + source venv/bin/activate + pip install -r requirements.txt + pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl + agbenchmark start --reg + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} diff --git a/.gitmodules b/.gitmodules index b5b7ba249..b45a16ada 100644 --- a/.gitmodules +++ b/.gitmodules @@ -6,3 +6,7 @@ path = agent/gpt-engineer url = https://github.com/merwanehamadi/gpt-engineer.git branch = benchmark-integration +[submodule "agent/smol-developer"] + path = agent/smol-developer + url = https://github.com/merwanehamadi/developer.git + branch = benchmark-integration diff --git a/agent/smol-developer b/agent/smol-developer new file mode 160000 index 000000000..896198af5 --- /dev/null +++ b/agent/smol-developer @@ -0,0 +1 @@ +Subproject commit 896198af51dd86dc3cfc2e258c3479948844e283 -- cgit v1.2.3 From 5318535d0d26bbd819c135a5f1b8022133c79fcb Mon Sep 17 00:00:00 2001 From: James Date: Tue, 4 Jul 2023 21:28:02 +0800 Subject: Fix summarize_text recursion calls (#4876) `summarize_text` is currently broken, because it calls itself with the wrong args (missing `config`) --- autogpt/processing/text.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/autogpt/processing/text.py b/autogpt/processing/text.py index 
24851b1c4..cf81fa71d 100644 --- a/autogpt/processing/text.py +++ b/autogpt/processing/text.py @@ -131,13 +131,12 @@ def summarize_text( logger.info( f"Summarizing chunk {i + 1} / {len(chunks)} of length {chunk_length} tokens" ) - summary, _ = summarize_text(chunk, instruction) + summary, _ = summarize_text(chunk, config, instruction) summaries.append(summary) logger.info(f"Summarized {len(chunks)} chunks") - summary, _ = summarize_text("\n\n".join(summaries)) - + summary, _ = summarize_text("\n\n".join(summaries), config, instruction) return summary.strip(), [ (summaries[i], chunks[i][0]) for i in range(0, len(chunks)) ] -- cgit v1.2.3 From 7f098d5fb6e652f78267294da4dfe5296760e031 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Tue, 4 Jul 2023 09:13:29 -0700 Subject: Explain how to benchmark new agents (#49) --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index 504132ddb..c0f67f153 100644 --- a/README.md +++ b/README.md @@ -115,3 +115,14 @@ Manually created, existing challenges within Auto-Gpt, https://osu-nlp-group.git | | |-- basic_abilities/ **every llm should pass these challenges** | | |-- regression/ **challenges that already passed** ``` + +## How to add new agents to agbenchmark ? +Example with smol developer. + +1- Create a github branch with your agent following the same pattern as this example: + +https://github.com/smol-ai/developer/pull/114/files + +2- Create the submodule and the github workflow by following the same pattern as this example: + +https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/pull/48/files -- cgit v1.2.3 From e25f6103443b83f017c4d0bd3a7be9c98cf7e83a Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Tue, 4 Jul 2023 13:23:00 -0400 Subject: local runs, home_path config, submodule miniagi (#50) --- .github/workflows/mini-agi.yml | 63 ++++++++++ .gitmodules | 4 + README.md | 128 ++------------------- agbenchmark/README.md | 126 ++++++++++++++++++++ agbenchmark/agent_interface.py | 11 +- agbenchmark/start_benchmark.py | 2 +- .../basic_abilities/write_file/write_file_test.py | 2 + agent/Auto-GPT | 2 +- agent/benchmarks.py | 15 --- agent/benchmarks_example.py | 35 ++++++ agent/config_example.json | 6 + agent/gpt-engineer | 2 +- agent/mini-agi | 1 + agent/regression_tests_example.json | 7 ++ agent/smol-developer | 2 +- config.json | 3 +- 16 files changed, 262 insertions(+), 147 deletions(-) create mode 100644 .github/workflows/mini-agi.yml create mode 100644 agbenchmark/README.md delete mode 100644 agent/benchmarks.py create mode 100644 agent/benchmarks_example.py create mode 100644 agent/config_example.json create mode 160000 agent/mini-agi create mode 100644 agent/regression_tests_example.json diff --git a/.github/workflows/mini-agi.yml b/.github/workflows/mini-agi.yml new file mode 100644 index 000000000..92980572a --- /dev/null +++ b/.github/workflows/mini-agi.yml @@ -0,0 +1,63 @@ +name: mini-agi Regression Test + +on: + workflow_dispatch: + branches: [master] + push: + branches: [stable, master, ci-test*] + +jobs: + regression-tests: + permissions: + pull-requests: write + contents: write + runs-on: ubuntu-latest + timeout-minutes: 30 + strategy: + matrix: + python-version: ['3.10'] + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 0 + ref: ${{ github.event.pull_request.head.ref }} + repository: ${{ github.event.pull_request.head.repo.full_name }} + submodules: true + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + 
with: + python-version: ${{ matrix.python-version }} + + - id: get_date + name: Get date + run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT + + - name: Install Poetry + run: | + curl -sSL https://install.python-poetry.org | python - + + - name: Set up Poetry cache + uses: actions/cache@v2 + with: + path: | + ~/.cache/pypoetry + .venv + key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} + + - name: Set up venv and install Python dependencies + run: | + poetry install --only main + poetry build + + - name: Run regression tests + run: | + cd agent/mini-agi + make install + source venv/bin/activate + pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl + agbenchmark start --reg + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} diff --git a/.gitmodules b/.gitmodules index b45a16ada..5af445f7a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -6,6 +6,10 @@ path = agent/gpt-engineer url = https://github.com/merwanehamadi/gpt-engineer.git branch = benchmark-integration +[submodule "agent/mini-agi"] + path = agent/mini-agi + url = https://github.com/SilenNaihin/mini-agi.git + branch = benchmark-integration [submodule "agent/smol-developer"] path = agent/smol-developer url = https://github.com/merwanehamadi/developer.git diff --git a/README.md b/README.md index c0f67f153..ed348b5ab 100644 --- a/README.md +++ b/README.md @@ -2,127 +2,13 @@ A repo built for the purpose of benchmarking the performance of agents far and wide, regardless of how they are set up and how they work -## As a user +### Scores: -1. `pip install auto-gpt-benchmarks` -2. Add boilerplate code to run and kill agent -3. `agbenchmark start` - - `--category challenge_category` to run tests in a specific category - - `--mock` to only run mock tests if they exists for each test - - `--noreg` to skip any tests that have passed in the past. When you run without this flag and a previous challenge that passed fails, it will now not be regression tests -4. We call boilerplate code for your agent -5. Show pass rate of tests, logs, and any other metrics +Scoring of agents will go here. Both overall and by category. -## Contributing +### Integrated Agents -##### Diagrams: https://whimsical.com/agbenchmark-5n4hXBq1ZGzBwRsK4TVY7x - -### To run the existing mocks - -1. clone the repo `auto-gpt-benchmarks` -2. `pip install poetry` -3. `poetry shell` -4. `poetry install` -5. `cp .env_example .env` -6. `agbenchmark start --mock` - Keep config the same and watch the logs :) - -### To run with mini-agi - -1. Navigate to `auto-gpt-benchmarks/agent/mini-agi` -2. `pip install -r requirements.txt` -3. `cp .env_example .env`, set `PROMPT_USER=false` and add your `OPENAI_API_KEY=`. Sset `MODEL="gpt-3.5-turbo"` if you don't have access to `gpt-4` yet. Also make sure you have Python 3.10^ installed -4. Make sure to follow the commands above, and remove mock flag `agbenchmark start` - -- To add requirements `poetry add requirement`. - -Feel free to create prs to merge with `main` at will (but also feel free to ask for review) - if you can't send msg in R&D chat for access. - -If you push at any point and break things - it'll happen to everyone - fix it asap. 
Step 1 is to revert `master` to last working commit - -Let people know what beautiful code you write does, document everything well - -Share your progress :) - -### Pytest - -an example of a test is below, use it as a template and change the class name, the .json name, what the test depends on and it's name, and the scoring logic - -```python -import pytest -from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge -import os - - -class TestWriteFile(BasicChallenge): - """Testing if LLM can write to a file""" - - def get_file_path(self) -> str: # all tests must implement this method - return os.path.join(os.path.dirname(__file__), "w_file_data.json") - - @pytest.mark.depends(on=[], name="basic_write_file") - def test_method(self, workspace): - # implement scoring logic by looking at workspace -``` - -All challenges will inherit from parent class which has the mark and any specific methods for their category - -```python -@pytest.mark.basic -class BasicChallenge(Challenge): - pass -``` - -Add the below to create a file in the workspace prior to running a challenge. Only use when a file is needed to be created in the workspace prior to a test, such as with the read_file_test. -```python -@pytest.fixture( - scope="module", autouse=True - ) # this is specific to setting up a file for the test, not all tests have this - def setup_module(self, workspace): - Challenge.write_to_file( - workspace, self.data.ground.files[0], "this is how we're doing" - ) -``` - -#### The main Challenge class has all the parametrization and loading logic so that all tests can inherit from it. It lives within [this file](https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/blob/master/agbenchmark/Challenge.py) - -## Workspace - -If `--mock` flag is used it is at `agbenchmark/mocks/workspace`. Otherwise for mini-agi it is at `C:/Users//miniagi` - it will be automitcally set on config - -#### Dataset - -Manually created, existing challenges within Auto-Gpt, https://osu-nlp-group.github.io/Mind2Web/ - -## Repo - -``` -|-- auto-gpt-benchmarks/ **main project directory** -| |-- metrics.py **combining scores, metrics, final evaluation** -| |-- start_benchmark.py **entry point from cli** -| |-- conftest.py **config, workspace creation + teardown, regression tesst markers, parameterization** -| |-- Challenge.py **easy challenge creation class** -| |-- config.json **workspace folder** -| |-- challenges/ **challenges across different domains** -| | |-- adaptability/ -| | |-- basic_abilities/ -| | |-- code/ -| | |-- memory/ -| | |-- retrieval/ -| | |-- web_navigation/ -| | |-- writing/ -| |-- tests/ -| | |-- basic_abilities/ **every llm should pass these challenges** -| | |-- regression/ **challenges that already passed** -``` - -## How to add new agents to agbenchmark ? -Example with smol developer. - -1- Create a github branch with your agent following the same pattern as this example: - -https://github.com/smol-ai/developer/pull/114/files - -2- Create the submodule and the github workflow by following the same pattern as this example: - -https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/pull/48/files +- Auto-GPT +- gpt-engineer +- mini-agi +- smol-developer diff --git a/agbenchmark/README.md b/agbenchmark/README.md new file mode 100644 index 000000000..a478f83f3 --- /dev/null +++ b/agbenchmark/README.md @@ -0,0 +1,126 @@ +## As a user + +1. `pip install auto-gpt-benchmarks` +2. Add boilerplate code to run and kill agent +3. 
`agbenchmark start`
+   - `--category challenge_category` to run tests in a specific category
+   - `--mock` to only run mock tests if they exist for each test
+   - `--noreg` to skip any tests that have passed in the past. When you run without this flag and a previously passing challenge fails, it will no longer be counted as a regression test
+4. We call boilerplate code for your agent
+5. Show pass rate of tests, logs, and any other metrics
+
+## Contributing
+
+##### Diagrams: https://whimsical.com/agbenchmark-5n4hXBq1ZGzBwRsK4TVY7x
+
+### To run the existing mocks
+
+1. Clone the repo `auto-gpt-benchmarks`
+2. `pip install poetry`
+3. `poetry shell`
+4. `poetry install`
+5. `cp .env_example .env`
+6. `agbenchmark start --mock`
+   Keep config the same and watch the logs :)
+
+### To run with mini-agi
+
+1. Navigate to `auto-gpt-benchmarks/agent/mini-agi`
+2. `pip install -r requirements.txt`
+3. `cp .env_example .env`, set `PROMPT_USER=false` and add your `OPENAI_API_KEY=`. Set `MODEL="gpt-3.5-turbo"` if you don't have access to `gpt-4` yet. Also make sure you have Python 3.10 or higher installed
+4. Make sure to follow the commands above, and remove the mock flag: `agbenchmark start`
+
+- To add requirements, run `poetry add requirement`.
+
+Feel free to create PRs to merge with `main` at will (but also feel free to ask for review) - if you can't, send a message in the R&D chat for access.
+
+If you push at any point and break things - it'll happen to everyone - fix it ASAP. Step 1 is to revert `master` to the last working commit
+
+Let people know what your beautiful code does, and document everything well
+
+Share your progress :)
+
+### Pytest
+
+An example of a test is below; use it as a template and change the class name, the .json name, what the test depends on and its name, and the scoring logic
+
+```python
+import pytest
+from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge
+import os
+
+
+class TestWriteFile(BasicChallenge):
+    """Testing if LLM can write to a file"""
+
+    def get_file_path(self) -> str:  # all tests must implement this method
+        return os.path.join(os.path.dirname(__file__), "w_file_data.json")
+
+    @pytest.mark.depends(on=[], name="basic_write_file")
+    def test_method(self, workspace):
+        pass  # implement scoring logic by looking at workspace
+```
+
+All challenges will inherit from a parent class which has the mark and any specific methods for their category
+
+```python
+@pytest.mark.basic
+class BasicChallenge(Challenge):
+    pass
+```
+
+Add the below to create a file in the workspace before a challenge runs. Only use this when a test needs a file to exist in the workspace beforehand, such as with the read_file_test.
+
+```python
+    @pytest.fixture(
+        scope="module", autouse=True
+    )  # this is specific to setting up a file for the test, not all tests have this
+    def setup_module(self, workspace):
+        Challenge.write_to_file(
+            workspace, self.data.ground.files[0], "this is how we're doing"
+        )
+```
+
+#### The main Challenge class has all the parametrization and loading logic so that all tests can inherit from it. It lives within [this file](https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/blob/master/agbenchmark/Challenge.py)
+
+## Workspace
+
+If the `--mock` flag is used, it is at `agbenchmark/mocks/workspace`. 
Otherwise, for mini-agi it is at `C:/Users//miniagi` - it will be automatically set in the config
+
+#### Dataset
+
+Manually created, existing challenges within Auto-GPT, https://osu-nlp-group.github.io/Mind2Web/
+
+## Repo
+
+```
+|-- auto-gpt-benchmarks/ **main project directory**
+|   |-- metrics.py **combining scores, metrics, final evaluation**
+|   |-- start_benchmark.py **entry point from cli**
+|   |-- conftest.py **config, workspace creation + teardown, regression test markers, parameterization**
+|   |-- Challenge.py **easy challenge creation class**
+|   |-- config.json **workspace folder**
+|   |-- challenges/ **challenges across different domains**
+|   |   |-- adaptability/
+|   |   |-- basic_abilities/
+|   |   |-- code/
+|   |   |-- memory/
+|   |   |-- retrieval/
+|   |   |-- web_navigation/
+|   |   |-- writing/
+|   |-- tests/
+|   |   |-- basic_abilities/ **every LLM should pass these challenges**
+|   |   |-- regression/ **challenges that already passed**
+```
+
+## How to add new agents to agbenchmark?
+
+Example with smol developer.
+
+1. Create a GitHub branch with your agent following the same pattern as this example:
+
+https://github.com/smol-ai/developer/pull/114/files
+
+2. Create the submodule and the GitHub workflow by following the same pattern as this example:
+
+https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/pull/48/files
diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py
index bd75f8dbb..993aa242a 100644
--- a/agbenchmark/agent_interface.py
+++ b/agbenchmark/agent_interface.py
@@ -1,4 +1,3 @@
-import importlib
 import os
 import subprocess
 import sys
@@ -29,18 +28,18 @@ def run_agent(
         mock_manager.delegate(mock_func)
     else:
         timeout = config["cutoff"]
-        print(f"Running Python function '{config['func_path']}' with timeout {timeout}")
+        print(
+            f"Running Python function '{config['entry_path']}' with timeout {timeout}"
+        )

         # Get the current working directory
         cwd = os.getcwd()

         # Add current directory to Python's import path
         sys.path.append(cwd)
+        sys.path.append(os.path.join(cwd, config["home_path"]))

-        module_name = config["func_path"].replace("/", ".").rstrip(".py")
-        module = importlib.import_module(module_name)
-
-        command = [sys.executable, "benchmarks.py", str(task)]
+        command = [sys.executable, config["entry_path"], str(task)]
         process = subprocess.Popen(
             command,
             stdout=subprocess.PIPE,
diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py
index 7489aa309..8ef01d3c5 100644
--- a/agbenchmark/start_benchmark.py
+++ b/agbenchmark/start_benchmark.py
@@ -38,7 +38,7 @@ def start(category: str, reg: bool, mock: bool) -> int:
         default=os.path.join(Path.home(), "workspace"),
     )

-    config["func_path"] = click.prompt(
+    config["entry_path"] = click.prompt(
         "Please enter the path to your run_specific_agent function implementation",
         default="/benchmarks.py",
     )
diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py
index 05db09657..c59e03ccf 100644
--- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py
+++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py
@@ -1,6 +1,7 @@
 import os
 from pathlib import Path
 from typing import Any, Dict
+import pytest

 from agbenchmark.tests.basic_abilities.basic_challenge import BasicChallenge

@@ -11,6 +12,7 @@ class TestWriteFile(BasicChallenge):
     def get_file_path(self) -> str:  # all tests must implement this method
         return os.path.join(os.path.dirname(__file__), "w_file_data.json")

+    
@pytest.mark.depends(name="basic_write_file") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) diff --git a/agent/Auto-GPT b/agent/Auto-GPT index 2e5eac51d..dd65cc256 160000 --- a/agent/Auto-GPT +++ b/agent/Auto-GPT @@ -1 +1 @@ -Subproject commit 2e5eac51d06d495919d720d370c4d9efd49f4784 +Subproject commit dd65cc256ca72cb199fe8c5d6ae31c23a7acee62 diff --git a/agent/benchmarks.py b/agent/benchmarks.py deleted file mode 100644 index eb66412c1..000000000 --- a/agent/benchmarks.py +++ /dev/null @@ -1,15 +0,0 @@ -# import subprocess - - -def run_specific_agent(task, conn): - cycle_count = 0 - while ( - not conn.poll() - ): # Check if there's a termination signal from the main process - response = run_agent(task) # run the agent and get the response and cycle count - - if response: - cycle_count += 1 - - # Send response and cycle count back to the main process - conn.send((response, cycle_count)) diff --git a/agent/benchmarks_example.py b/agent/benchmarks_example.py new file mode 100644 index 000000000..0c35aa9bb --- /dev/null +++ b/agent/benchmarks_example.py @@ -0,0 +1,35 @@ +import os +import sys +from typing import Tuple +import pexpect + + +def run_specific_agent(task: str) -> Tuple[str, int]: + # Ensure the directory for the project exists + os.makedirs("workspace_path", exist_ok=True) + + # Run the agent command + child = pexpect.spawn(f"python example.py {task}") + + # Create a loop to continuously read output + while True: + try: + child.expect("\n") # This waits until a newline appears + print(child.before.decode()) # This prints the line + except pexpect.EOF: + break # No more output, break the loop + + # Check the exit status + child.close() # Close the child process + + # Return child process's exit status and any error messages + return child.before.decode(), child.exitstatus + + +if __name__ == "__main__": + # The first argument is the script name itself, second is the task + if len(sys.argv) != 2: + print("Usage: python script.py ") + sys.exit(1) + task = sys.argv[1] + run_specific_agent(task) diff --git a/agent/config_example.json b/agent/config_example.json new file mode 100644 index 000000000..ba2ec0b80 --- /dev/null +++ b/agent/config_example.json @@ -0,0 +1,6 @@ +{ + "workspace": "projects/my-new-project/workspace", + "entry_path": "benchmarks.py", + "home_path": "", + "cutoff": 60 +} diff --git a/agent/gpt-engineer b/agent/gpt-engineer index f91ac66b8..155ea895e 160000 --- a/agent/gpt-engineer +++ b/agent/gpt-engineer @@ -1 +1 @@ -Subproject commit f91ac66b8e8210760aaa0047f2ca11c52e55aaa5 +Subproject commit 155ea895eb5f7e44ed8647b335d90a03b5ffb06d diff --git a/agent/mini-agi b/agent/mini-agi new file mode 160000 index 000000000..70bd3f035 --- /dev/null +++ b/agent/mini-agi @@ -0,0 +1 @@ +Subproject commit 70bd3f035e7d898221cdb0fc2912d20037fec901 diff --git a/agent/regression_tests_example.json b/agent/regression_tests_example.json new file mode 100644 index 000000000..a0c76dc55 --- /dev/null +++ b/agent/regression_tests_example.json @@ -0,0 +1,7 @@ +{ + "TestWriteFile": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py" + } +} diff --git a/agent/smol-developer b/agent/smol-developer index 896198af5..5a3ad4310 160000 --- a/agent/smol-developer +++ b/agent/smol-developer @@ -1 +1 @@ -Subproject commit 896198af51dd86dc3cfc2e258c3479948844e283 +Subproject commit 5a3ad43103b238b9c8f2a2acceff250888be263e diff --git a/config.json b/config.json index 652618e4b..ba2ec0b80 
100644 --- a/config.json +++ b/config.json @@ -1,5 +1,6 @@ { "workspace": "projects/my-new-project/workspace", - "func_path": "benchmarks.py", + "entry_path": "benchmarks.py", + "home_path": "", "cutoff": 60 } -- cgit v1.2.3 From 73a3e9e42df7caf7d6c65e83898ad9893a829ede Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Tue, 4 Jul 2023 13:41:54 -0400 Subject: fixing mini-agi workflow --- .github/workflows/mini-agi.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/mini-agi.yml b/.github/workflows/mini-agi.yml index 92980572a..c62df7663 100644 --- a/.github/workflows/mini-agi.yml +++ b/.github/workflows/mini-agi.yml @@ -55,8 +55,8 @@ jobs: - name: Run regression tests run: | cd agent/mini-agi - make install - source venv/bin/activate + pip install -r requirements.txt + cp .env_example .env pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl agbenchmark start --reg env: -- cgit v1.2.3 From ed9aef5f437abc90c314c2e872623c0a2cb3d933 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Tue, 4 Jul 2023 13:51:04 -0400 Subject: adding venv to mini-agi --- .github/workflows/mini-agi.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/mini-agi.yml b/.github/workflows/mini-agi.yml index c62df7663..baf9100e8 100644 --- a/.github/workflows/mini-agi.yml +++ b/.github/workflows/mini-agi.yml @@ -55,6 +55,8 @@ jobs: - name: Run regression tests run: | cd agent/mini-agi + python -m venv venv + source venv/bin/activate pip install -r requirements.txt cp .env_example .env pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl -- cgit v1.2.3 From 62d37755bc72500f4e057450bc3c30b7d9e1e341 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Tue, 4 Jul 2023 14:11:38 -0400 Subject: updating submodule commit --- agent/mini-agi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agent/mini-agi b/agent/mini-agi index 70bd3f035..d99220c05 160000 --- a/agent/mini-agi +++ b/agent/mini-agi @@ -1 +1 @@ -Subproject commit 70bd3f035e7d898221cdb0fc2912d20037fec901 +Subproject commit d99220c058c9fb45f83256ea361b55dba506fa75 -- cgit v1.2.3 From e6e92e99525ef58ac179254d08fe52d2281c1fcc Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Tue, 4 Jul 2023 14:15:04 -0400 Subject: run in continuous --- .github/workflows/mini-agi.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/mini-agi.yml b/.github/workflows/mini-agi.yml index baf9100e8..055df304d 100644 --- a/.github/workflows/mini-agi.yml +++ b/.github/workflows/mini-agi.yml @@ -63,3 +63,4 @@ jobs: agbenchmark start --reg env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + PROMPT_USER: false -- cgit v1.2.3 From f8e550773f49aff8b39750b5a5ac37224d6609ce Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Tue, 4 Jul 2023 15:28:00 -0700 Subject: Add retrieval challenge test + run tests on CI pipeline (#51) --- .github/workflows/ci.yml | 40 ++++++++++++++++++++++ agbenchmark/challenges/retrieval/r1/r1_data.json | 18 +++++----- agbenchmark/challenges/retrieval/r1/r1_test.py | 10 ++++-- agbenchmark/mocks/tests/basic_mocks.py | 11 ++++++ agbenchmark/mocks/workspace/file_to_check.txt | 1 - .../basic_abilities/write_file/write_file_test.py | 1 + mypy.ini | 13 +++++++ regression_tests.json | 12 +++++++ 8 files changed, 94 insertions(+), 12 deletions(-) delete mode 100644 agbenchmark/mocks/workspace/file_to_check.txt diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2d25e4ffe..6fb348e9b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -66,3 +66,43 @@ 
jobs: cmd="poetry run autoflake --remove-all-unused-imports --recursive --ignore-init-module-imports --ignore-pass-after-docstring agbenchmark" $cmd --check || (echo "You have unused imports or pass statements, please run '${cmd} --in-place'" && exit 1) if: success() || failure() + + tests: + + runs-on: ubuntu-latest + env: + min-python-version: "3.10" + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 0 + ref: ${{ github.event.pull_request.head.ref }} + repository: ${{ github.event.pull_request.head.repo.full_name }} + + - name: Set up Python ${{ env.min-python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ env.min-python-version }} + + - id: get_date + name: Get date + run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT + + - name: Install Poetry + run: | + curl -sSL https://install.python-poetry.org | python - + + - name: Set up Poetry cache + uses: actions/cache@v2 + with: + path: | + ~/.cache/pypoetry + .venv + key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} + + - name: Install dependencies + run: | + poetry install + poetry run agbenchmark start --mock diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index 80c5e51eb..44fce24ea 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -1,21 +1,21 @@ { - "name": "retrieval1", - "category": ["basic"], - "task": "Print the the capital of America to a .txt file", + "name": "basic_information_retrieval", + "category": ["retrieval"], + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", "dependencies": [], "ground": { - "answer": "Washington", - "should_contain": ["Washington"], - "should_not_contain": ["New York", "Los Angeles", "San Francisco"], + "answer": "£25.89", + "should_contain": ["25.89"], + "should_not_contain": [], "files": [".txt"] }, "mock": { - "mock_func": "basic_write_file_mock", - "mock_task": "What is the capital of America?" + "mock_func": "basic_retrieval_mock", + "mock_task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file." 
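For readers skimming the challenge JSON above: the `ground` block (`answer`, `should_contain`, `should_not_contain`, `files`) drives a simple substring check against the files the agent produced. The following is a rough sketch of that check under stated assumptions — the real logic lives in `Challenge.scoring` and may differ:

```python
# Rough sketch of the substring scoring a "ground" block implies.
# Field names mirror the challenge JSON; the exact semantics are assumed.
from typing import Dict, List


def score(file_content: str, ground: Dict[str, List[str]]) -> float:
    if not all(s in file_content for s in ground.get("should_contain", [])):
        return 0.0  # a required string is missing
    if any(s in file_content for s in ground.get("should_not_contain", [])):
        return 0.0  # a forbidden string is present
    return 1.0


# The book-price challenge above would pass on a file containing "25.89".
print(score("The price of the book is 25.89", {"should_contain": ["25.89"]}))
```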
}, "info": { "difficulty": "basic", - "description": "Tests the writing to file", + "description": "Tests ability to retrieve information from a website.", "side_effects": ["tests if there is in fact an LLM attached"] } } diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 675ac8bd7..914784960 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -1,18 +1,24 @@ import os +from pathlib import Path from typing import Any, Dict +import pytest + from agbenchmark.challenges.retrieval.retrieval import RetrievalChallenge -class TestRetrieval1(RetrievalChallenge): +class TestRetrieval(RetrievalChallenge): """The first information-retrieval challenge""" def get_file_path(self) -> str: # all tests must implement this method return os.path.join(os.path.dirname(__file__), "r1_data.json") + @pytest.mark.depends(on=["basic_write_file"]) def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + + workspace = Path(os.getcwd()) / config["workspace"] + files_contents = self.open_files(workspace, self.data.ground.files) scores = [] for file_content in files_contents: diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index c79a8e2dd..07d8a6de0 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -22,3 +22,14 @@ def basic_write_file_mock(task: str, workspace: str) -> None: "file_to_check.txt", "Washington DC is the capital of the United States of America", ) + + +def basic_retrieval_mock(task: str, workspace: str) -> None: + """ + This mock writes to a file (creates one if it doesn't exist) + """ + Challenge.write_to_file( + workspace, + "file_to_check.txt", + "25.89", + ) diff --git a/agbenchmark/mocks/workspace/file_to_check.txt b/agbenchmark/mocks/workspace/file_to_check.txt deleted file mode 100644 index 48dc8cff1..000000000 --- a/agbenchmark/mocks/workspace/file_to_check.txt +++ /dev/null @@ -1 +0,0 @@ -Washington DC is the capital of the United States of America \ No newline at end of file diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index c59e03ccf..966df7f2d 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -1,6 +1,7 @@ import os from pathlib import Path from typing import Any, Dict + import pytest from agbenchmark.tests.basic_abilities.basic_challenge import BasicChallenge diff --git a/mypy.ini b/mypy.ini index 315ecae56..ceb13fcd2 100644 --- a/mypy.ini +++ b/mypy.ini @@ -3,3 +3,16 @@ follow_imports = skip check_untyped_defs = True disallow_untyped_defs = True exclude = ^(agent/.*\.py)$ +ignore_missing_imports = True + +[mypy-agbenchmark.mocks.mock_manager.*] +ignore_errors = True + +[mypy-agbenchmark.tests.basic_abilities.basic_challenge.*] +ignore_errors = True + +[mypy-agbenchmark.mocks.tests.basic_mocks.*] +ignore_errors = True + +[mypy-agbenchmark.tests.regression.RegressionManager.*] +ignore_errors = True diff --git a/regression_tests.json b/regression_tests.json index e3633a2af..9b998d115 100644 --- a/regression_tests.json +++ b/regression_tests.json @@ -1,7 +1,19 @@ { + "TestRetrieval": { + "difficulty": "basic", + "dependencies": [], + "test": 
"agbenchmark/challenges/retrieval/r1/r1_test.py" + }, "TestWriteFile": { "difficulty": "basic", "dependencies": [], "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py" + }, + "TestReadFile": { + "difficulty": "basic", + "dependencies": [ + "basic_write_file" + ], + "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py" } } \ No newline at end of file -- cgit v1.2.3 From cef259c945a50c3e3884564c63da4a9a6f2abcd5 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Tue, 4 Jul 2023 17:34:55 -0700 Subject: Add pr template (#52) --- .github/PULL_REQUEST_TEMPLATE.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 .github/PULL_REQUEST_TEMPLATE.md diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 000000000..ee5d8bf15 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,15 @@ +### Background + + +### Changes + + + +### PR Quality Checklist +- [ ] I have run the following commands against my code to ensure it passes our linters: + ```shell + black . + isort . + mypy . + autoflake --remove-all-unused-imports --recursive --ignore-init-module-imports --ignore-pass-after-docstring --in-place agbenchmark + ``` -- cgit v1.2.3 From e3c453f10e60f056ea8d8d28849264ab766d9c57 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Tue, 4 Jul 2023 17:41:13 -0700 Subject: Add information retrieval 3 (#54) Co-authored-by: Silen Naihin --- agbenchmark/challenges/retrieval/r2/r2_data.json | 21 +++++++++++++++++ agbenchmark/challenges/retrieval/r2/r2_test.py | 29 ++++++++++++++++++++++++ agbenchmark/challenges/retrieval/r3/r3_data.json | 21 +++++++++++++++++ agbenchmark/challenges/retrieval/r3/r3_test.py | 29 ++++++++++++++++++++++++ agbenchmark/mocks/tests/basic_mocks.py | 22 ++++++++++++++++++ regression_tests.json | 10 ++++++++ 6 files changed, 132 insertions(+) create mode 100644 agbenchmark/challenges/retrieval/r2/r2_data.json create mode 100644 agbenchmark/challenges/retrieval/r2/r2_test.py create mode 100644 agbenchmark/challenges/retrieval/r3/r3_data.json create mode 100644 agbenchmark/challenges/retrieval/r3/r3_test.py diff --git a/agbenchmark/challenges/retrieval/r2/r2_data.json b/agbenchmark/challenges/retrieval/r2/r2_data.json new file mode 100644 index 000000000..925e6db83 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r2/r2_data.json @@ -0,0 +1,21 @@ +{ + "name": "basic_information_retrieval", + "category": ["retrieval"], + "task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "dependencies": [], + "ground": { + "answer": "81,462", + "should_contain": ["81,462"], + "should_not_contain": [], + "files": [".txt"] + }, + "mock": { + "mock_func": "basic_retrieval_2_mock", + "mock_task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)." 
+ }, + "info": { + "difficulty": "basic", + "description": "Tests ability to retrieve information.", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/retrieval/r2/r2_test.py b/agbenchmark/challenges/retrieval/r2/r2_test.py new file mode 100644 index 000000000..bdc738868 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r2/r2_test.py @@ -0,0 +1,29 @@ +import os +from pathlib import Path +from typing import Any, Dict + +import pytest + +from agbenchmark.challenges.retrieval.retrieval import RetrievalChallenge + + +class TestRetrieval2(RetrievalChallenge): + """The first information-retrieval challenge""" + + def get_file_path(self) -> str: # all tests must implement this method + return os.path.join(os.path.dirname(__file__), "r2_data.json") + + @pytest.mark.depends(on=["basic_write_file"]) + def test_method(self, config: Dict[str, Any]) -> None: + self.setup_challenge(config) + + workspace = Path(os.getcwd()) / config["workspace"] + files_contents = self.open_files(workspace, self.data.ground.files) + + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, self.data.ground) + print("Your score is:", score) + scores.append(score) + + assert 1 in scores diff --git a/agbenchmark/challenges/retrieval/r3/r3_data.json b/agbenchmark/challenges/retrieval/r3/r3_data.json new file mode 100644 index 000000000..183529c48 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r3/r3_data.json @@ -0,0 +1,21 @@ +{ + "name": "basic_information_retrieval", + "category": ["retrieval"], + "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "dependencies": [], + "ground": { + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "should_contain": ["15", "112", "117", "204", "413", "2,014", "3,198", "4,046", "7,000", "11,759", "21,461", "24,578", "31,536", "53,823", "81,462"], + "should_not_contain": [], + "files": [".txt"] + }, + "mock": { + "mock_func": "basic_retrieval_3_mock", + "mock_task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)." 
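The `dependencies` fields in these challenge files and the `@pytest.mark.depends(...)` decorators in the tests come from the pytest-depends plugin. A minimal usage sketch, assuming the plugin is installed — a test publishes a name, and tests that depend on it are skipped when it fails:

```python
# Minimal pytest-depends sketch mirroring the decorators used in this repo.
import pytest


@pytest.mark.depends(name="basic_write_file")
def test_write_file() -> None:
    assert True


@pytest.mark.depends(on=["basic_write_file"])
def test_read_file() -> None:
    assert True  # skipped automatically if test_write_file fails
```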
+ }, + "info": { + "difficulty": "basic", + "description": "Tests ability to retrieve information.", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/retrieval/r3/r3_test.py b/agbenchmark/challenges/retrieval/r3/r3_test.py new file mode 100644 index 000000000..36382b69b --- /dev/null +++ b/agbenchmark/challenges/retrieval/r3/r3_test.py @@ -0,0 +1,29 @@ +import os +from pathlib import Path +from typing import Any, Dict + +import pytest + +from agbenchmark.challenges.retrieval.retrieval import RetrievalChallenge + + +class TestRetrieval3(RetrievalChallenge): + """The first information-retrieval challenge""" + + def get_file_path(self) -> str: # all tests must implement this method + return os.path.join(os.path.dirname(__file__), "r3_data.json") + + @pytest.mark.depends(on=["basic_write_file"]) + def test_method(self, config: Dict[str, Any]) -> None: + self.setup_challenge(config) + + workspace = Path(os.getcwd()) / config["workspace"] + files_contents = self.open_files(workspace, self.data.ground.files) + + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, self.data.ground) + print("Your score is:", score) + scores.append(score) + + assert 1 in scores diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index 07d8a6de0..882e3c829 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -33,3 +33,25 @@ def basic_retrieval_mock(task: str, workspace: str) -> None: "file_to_check.txt", "25.89", ) + + +def basic_retrieval_2_mock(task: str, workspace: str) -> None: + """ + This mock writes to a file (creates one if it doesn't exist) + """ + Challenge.write_to_file( + workspace, + "file_to_check.txt", + "81,462", + ) + + +def basic_retrieval_3_mock(task: str, workspace: str) -> None: + """ + This mock writes to a file (creates one if it doesn't exist) + """ + Challenge.write_to_file( + workspace, + "file_to_check.txt", + "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + ) diff --git a/regression_tests.json b/regression_tests.json index 9b998d115..853c38dcb 100644 --- a/regression_tests.json +++ b/regression_tests.json @@ -15,5 +15,15 @@ "basic_write_file" ], "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py" + }, + "TestRetrieval2": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/retrieval/r2/r2_test.py" + }, + "TestRetrieval3": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/retrieval/r3/r3_test.py" } } \ No newline at end of file -- cgit v1.2.3 From 351131bbffa2956cde7beacc1a7a95451c895b19 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Tue, 4 Jul 2023 18:45:35 -0700 Subject: Change test dependencies (#55) --- agbenchmark/challenges/retrieval/r1/r1_test.py | 2 +- agbenchmark/challenges/retrieval/r2/r2_test.py | 2 +- agbenchmark/challenges/retrieval/r3/r3_test.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 914784960..767775340 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -13,7 +13,7 @@ class TestRetrieval(RetrievalChallenge): def get_file_path(self) 
-> str: # all tests must implement this method return os.path.join(os.path.dirname(__file__), "r1_data.json") - @pytest.mark.depends(on=["basic_write_file"]) + @pytest.mark.depends(on=["basic_write_file"], name="test_retrieval") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) diff --git a/agbenchmark/challenges/retrieval/r2/r2_test.py b/agbenchmark/challenges/retrieval/r2/r2_test.py index bdc738868..7664ca36b 100644 --- a/agbenchmark/challenges/retrieval/r2/r2_test.py +++ b/agbenchmark/challenges/retrieval/r2/r2_test.py @@ -13,7 +13,7 @@ class TestRetrieval2(RetrievalChallenge): def get_file_path(self) -> str: # all tests must implement this method return os.path.join(os.path.dirname(__file__), "r2_data.json") - @pytest.mark.depends(on=["basic_write_file"]) + @pytest.mark.depends(on=["test_retrieval"], name="test_retrieval_2") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) diff --git a/agbenchmark/challenges/retrieval/r3/r3_test.py b/agbenchmark/challenges/retrieval/r3/r3_test.py index 36382b69b..c13de2c86 100644 --- a/agbenchmark/challenges/retrieval/r3/r3_test.py +++ b/agbenchmark/challenges/retrieval/r3/r3_test.py @@ -13,7 +13,7 @@ class TestRetrieval3(RetrievalChallenge): def get_file_path(self) -> str: # all tests must implement this method return os.path.join(os.path.dirname(__file__), "r3_data.json") - @pytest.mark.depends(on=["basic_write_file"]) + @pytest.mark.depends(on=["test_retrieval_2"], name="test_retrieval_3") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) -- cgit v1.2.3 From bfc7dfdb291099d75dcc1e0dbe3e03439b5163f5 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Tue, 4 Jul 2023 22:06:49 -0400 Subject: Dynamic workspace path (#56) --- agbenchmark/conftest.py | 23 ++++++++++++++++++++++- agent/mini-agi | 2 +- regression_tests.json | 2 +- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 613565fd2..66ede2c08 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -1,6 +1,7 @@ import json import os import shutil +from pathlib import Path # noqa from typing import Any, Dict, Generator, List import pytest @@ -9,6 +10,21 @@ from agbenchmark.start_benchmark import CONFIG_PATH, REGRESSION_TESTS_PATH from agbenchmark.tests.regression.RegressionManager import RegressionManager +def get_dynamic_workspace(config: Dict[str, Any]) -> str: + # Extract the string inside ${...} + path_expr = config["workspace"][2:-1] + + # Check if it starts with "os.path.join" + if path_expr.strip().startswith("os.path.join"): + # Evaluate the path string + path_value = eval(path_expr) + + # Replace the original string with the evaluated result + return path_value + else: + raise ValueError("Invalid workspace path expression.") + + @pytest.fixture(scope="module") def config(request: Any) -> None: print(f"Config file: {CONFIG_PATH}") @@ -17,11 +33,16 @@ def config(request: Any) -> None: if request.config.getoption("--mock"): config["workspace"] = "agbenchmark/mocks/workspace" + elif config.get("workspace", "").startswith("${") and config.get( + "workspace", "" + ).endswith("}"): + path = get_dynamic_workspace(config) + config["workspace"] = path return config -@pytest.fixture(scope="module") +@pytest.fixture(scope="module", autouse=True) def workspace(config: Dict[str, Any]) -> Generator[str, None, None]: yield config["workspace"] # teardown after test function completes diff --git a/agent/mini-agi b/agent/mini-agi 
index d99220c05..4af8a7e60 160000 --- a/agent/mini-agi +++ b/agent/mini-agi @@ -1 +1 @@ -Subproject commit d99220c058c9fb45f83256ea361b55dba506fa75 +Subproject commit 4af8a7e6085f0518f06180fbf87024a2c9db4c88 diff --git a/regression_tests.json b/regression_tests.json index 853c38dcb..d0a8ed19d 100644 --- a/regression_tests.json +++ b/regression_tests.json @@ -26,4 +26,4 @@ "dependencies": [], "test": "agbenchmark/challenges/retrieval/r3/r3_test.py" } -} \ No newline at end of file +} -- cgit v1.2.3 From 74fc969dd60dd40f6b5ee8806ecc80fea50cb7e2 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Wed, 5 Jul 2023 20:32:28 -0700 Subject: Add basic memory challenge (#57) --- .gitignore | 2 ++ agbenchmark/agent_interface.py | 2 +- agbenchmark/challenge.py | 20 ++++++++++++++++ .../memory/m1/artifacts/instructions_1.txt | 2 ++ .../memory/m1/artifacts/instructions_2.txt | 1 + .../memory/m1/artifacts/instructions_3.txt | 1 + .../memory/m1/artifacts/instructions_4.txt | 1 + .../memory/m1/artifacts/instructions_5.txt | 1 + agbenchmark/challenges/memory/m1/m1_data.json | 21 +++++++++++++++++ agbenchmark/challenges/memory/m1/m1_test.py | 27 ++++++++++++++++++++++ agbenchmark/challenges/memory/m1_test.py | 0 agbenchmark/challenges/memory/memory.py | 8 +++++++ agbenchmark/challenges/retrieval/r1/r1_test.py | 6 ++--- agbenchmark/challenges/retrieval/r2/r2_test.py | 4 +--- agbenchmark/challenges/retrieval/r3/r3_test.py | 4 +--- agbenchmark/conftest.py | 7 +++--- agbenchmark/mocks/mock_manager.py | 6 ++--- agbenchmark/mocks/tests/basic_mocks.py | 11 +++++++++ agbenchmark/start_benchmark.py | 2 -- .../read_file/artifacts/file_to_check.txt | 1 + .../basic_abilities/read_file/r_file_data.json | 4 ++-- .../basic_abilities/read_file/read_file_test.py | 7 ------ .../basic_abilities/write_file/write_file_test.py | 4 +--- regression_tests.json | 17 +++++++++----- 24 files changed, 121 insertions(+), 38 deletions(-) create mode 100644 agbenchmark/challenges/memory/m1/artifacts/instructions_1.txt create mode 100644 agbenchmark/challenges/memory/m1/artifacts/instructions_2.txt create mode 100644 agbenchmark/challenges/memory/m1/artifacts/instructions_3.txt create mode 100644 agbenchmark/challenges/memory/m1/artifacts/instructions_4.txt create mode 100644 agbenchmark/challenges/memory/m1/artifacts/instructions_5.txt create mode 100644 agbenchmark/challenges/memory/m1/m1_data.json create mode 100644 agbenchmark/challenges/memory/m1/m1_test.py delete mode 100644 agbenchmark/challenges/memory/m1_test.py create mode 100644 agbenchmark/challenges/memory/memory.py create mode 100644 agbenchmark/tests/basic_abilities/read_file/artifacts/file_to_check.txt diff --git a/.gitignore b/.gitignore index c41065ca4..3581dc933 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +agbenchmark/mocks/workspace/ + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index 993aa242a..4d74aac73 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -22,7 +22,7 @@ def run_agent( print("No mock provided") elif MOCK_FLAG == "True": mock_manager = MockManager( - task + task, config ) # workspace doesn't need to be passed in, stays the same print("Server unavailable, using mock", mock_func) mock_manager.delegate(mock_func) diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py index eaed73a22..865d64444 100644 --- a/agbenchmark/challenge.py +++ b/agbenchmark/challenge.py @@ -1,5 +1,7 @@ import glob +import inspect import 
os +import shutil from abc import ABC, abstractmethod from typing import Any, Dict, List, Optional @@ -45,6 +47,8 @@ class Challenge(ABC): def setup_challenge(self, config: Dict[str, Any]) -> None: from agbenchmark.agent_interface import run_agent + self.copy_artifacts_into_workspace(config["workspace"]) + run_agent(self.task, self.mock, config) @property @@ -124,3 +128,19 @@ class Challenge(ABC): ) return 1.0 + + def copy_artifacts_into_workspace(self, workspace: str) -> None: + curr_frame = inspect.currentframe() + outer_frame = inspect.getouterframes(curr_frame)[2] + caller_file_path = outer_frame.filename + caller_dir_path = os.path.dirname(os.path.abspath(caller_file_path)) + source_dir = os.path.join(caller_dir_path, "artifacts") + + # Check if source_dir exists, if not then return immediately. + if not os.path.exists(source_dir): + return + + for file_name in os.listdir(source_dir): + full_file_name = os.path.join(source_dir, file_name) + if os.path.isfile(full_file_name): + shutil.copy(full_file_name, workspace) diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_1.txt b/agbenchmark/challenges/memory/m1/artifacts/instructions_1.txt new file mode 100644 index 000000000..da3babb1f --- /dev/null +++ b/agbenchmark/challenges/memory/m1/artifacts/instructions_1.txt @@ -0,0 +1,2 @@ +The id to remember is 2314 +Read the file instructions_2.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_2.txt b/agbenchmark/challenges/memory/m1/artifacts/instructions_2.txt new file mode 100644 index 000000000..87c870b1a --- /dev/null +++ b/agbenchmark/challenges/memory/m1/artifacts/instructions_2.txt @@ -0,0 +1 @@ +Read the file instructions_3.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_3.txt b/agbenchmark/challenges/memory/m1/artifacts/instructions_3.txt new file mode 100644 index 000000000..987086208 --- /dev/null +++ b/agbenchmark/challenges/memory/m1/artifacts/instructions_3.txt @@ -0,0 +1 @@ +Read the file instructions_4.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_4.txt b/agbenchmark/challenges/memory/m1/artifacts/instructions_4.txt new file mode 100644 index 000000000..78da710b8 --- /dev/null +++ b/agbenchmark/challenges/memory/m1/artifacts/instructions_4.txt @@ -0,0 +1 @@ +Read the file instructions_5.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_5.txt b/agbenchmark/challenges/memory/m1/artifacts/instructions_5.txt new file mode 100644 index 000000000..8a2d35509 --- /dev/null +++ b/agbenchmark/challenges/memory/m1/artifacts/instructions_5.txt @@ -0,0 +1 @@ +Write the id previously mentioned to a .txt file. diff --git a/agbenchmark/challenges/memory/m1/m1_data.json b/agbenchmark/challenges/memory/m1/m1_data.json new file mode 100644 index 000000000..7023f85fc --- /dev/null +++ b/agbenchmark/challenges/memory/m1/m1_data.json @@ -0,0 +1,21 @@ +{ + "name": "basic_memory", + "category": ["memory"], + "task": "Follow the instructions in the instructions_1.txt file", + "dependencies": [], + "ground": { + "answer": "2314", + "should_contain": ["2314"], + "should_not_contain": [], + "files": ["file_to_check.txt"] + }, + "mock": { + "mock_func": "basic_memory_mock", + "mock_task": "Follow the instructions in the instructions_1.txt file" + }, + "info": { + "difficulty": "basic", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/memory/m1/m1_test.py b/agbenchmark/challenges/memory/m1/m1_test.py new file mode 100644 index 000000000..28e600cc3 --- /dev/null +++ b/agbenchmark/challenges/memory/m1/m1_test.py @@ -0,0 +1,27 @@ +import os +from typing import Any, Dict + +import pytest + +from agbenchmark.challenges.memory.memory import MemoryChallenge + + +class TestBasicMemory(MemoryChallenge): + """The first memory challenge""" + + def get_file_path(self) -> str: # all tests must implement this method + return os.path.join(os.path.dirname(__file__), "m1_data.json") + + @pytest.mark.depends(name="test_basic_memory") + def test_method(self, config: Dict[str, Any]) -> None: + self.setup_challenge(config) + + files_contents = self.open_files(config["workspace"], self.data.ground.files) + + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, self.data.ground) + print("Your score is:", score) + scores.append(score) + + assert 1 in scores diff --git a/agbenchmark/challenges/memory/m1_test.py b/agbenchmark/challenges/memory/m1_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/challenges/memory/memory.py b/agbenchmark/challenges/memory/memory.py new file mode 100644 index 000000000..429bef23a --- /dev/null +++ b/agbenchmark/challenges/memory/memory.py @@ -0,0 +1,8 @@ +import pytest + +from agbenchmark.challenge import Challenge + + +@pytest.mark.memory +class MemoryChallenge(Challenge): + """Challenge for memory""" diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 767775340..d107d9645 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -1,5 +1,4 @@ import os -from pathlib import Path from typing import Any, Dict import pytest @@ -13,12 +12,11 @@ class TestRetrieval(RetrievalChallenge): def get_file_path(self) -> str: # all tests must implement this method return os.path.join(os.path.dirname(__file__), "r1_data.json") - @pytest.mark.depends(on=["basic_write_file"], name="test_retrieval") + @pytest.mark.depends(name="test_retrieval") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - workspace = Path(os.getcwd()) / config["workspace"] - files_contents = self.open_files(workspace, self.data.ground.files) + files_contents = self.open_files(config["workspace"], self.data.ground.files) scores = [] for file_content in files_contents: diff --git a/agbenchmark/challenges/retrieval/r2/r2_test.py b/agbenchmark/challenges/retrieval/r2/r2_test.py index 7664ca36b..a60296ecd 100644 --- a/agbenchmark/challenges/retrieval/r2/r2_test.py +++ b/agbenchmark/challenges/retrieval/r2/r2_test.py @@ -1,5 +1,4 @@ import os -from pathlib import Path from typing import Any, Dict import pytest @@ -17,8 +16,7 @@ class TestRetrieval2(RetrievalChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - workspace = Path(os.getcwd()) / config["workspace"] - files_contents = self.open_files(workspace, self.data.ground.files) + files_contents = self.open_files(config["workspace"], self.data.ground.files) scores = [] for file_content in files_contents: diff --git a/agbenchmark/challenges/retrieval/r3/r3_test.py b/agbenchmark/challenges/retrieval/r3/r3_test.py index c13de2c86..bcd48d33c 100644 --- 
a/agbenchmark/challenges/retrieval/r3/r3_test.py +++ b/agbenchmark/challenges/retrieval/r3/r3_test.py @@ -1,5 +1,4 @@ import os -from pathlib import Path from typing import Any, Dict import pytest @@ -17,8 +16,7 @@ class TestRetrieval3(RetrievalChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - workspace = Path(os.getcwd()) / config["workspace"] - files_contents = self.open_files(workspace, self.data.ground.files) + files_contents = self.open_files(config["workspace"], self.data.ground.files) scores = [] for file_content in files_contents: diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 66ede2c08..7203ee6bb 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -31,14 +31,13 @@ def config(request: Any) -> None: with open(CONFIG_PATH, "r") as f: config = json.load(f) - if request.config.getoption("--mock"): - config["workspace"] = "agbenchmark/mocks/workspace" - elif config.get("workspace", "").startswith("${") and config.get( + if config.get("workspace", "").startswith("${") and config.get( "workspace", "" ).endswith("}"): path = get_dynamic_workspace(config) config["workspace"] = path - + else: + config["workspace"] = Path(os.getcwd()) / config["workspace"] return config diff --git a/agbenchmark/mocks/mock_manager.py b/agbenchmark/mocks/mock_manager.py index 59fa8dbf1..5b84965c3 100644 --- a/agbenchmark/mocks/mock_manager.py +++ b/agbenchmark/mocks/mock_manager.py @@ -1,13 +1,13 @@ -from typing import Any +from typing import Any, Dict import agbenchmark.mocks.tests.basic_mocks as basic_mocks import agbenchmark.mocks.tests.retrieval_mocks as retrieval_mocks class MockManager: - def __init__(self, task: str): + def __init__(self, task: str, config: Dict[str, Any]) -> None: self.task = task - self.workspace = "agbenchmark/mocks/workspace" + self.workspace = config["workspace"] self.modules = [basic_mocks, retrieval_mocks] def delegate(self, mock_function_name: Any, *args: Any, **kwargs: Any) -> None: diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index 882e3c829..3b9170f4e 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -55,3 +55,14 @@ def basic_retrieval_3_mock(task: str, workspace: str) -> None: "file_to_check.txt", "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", ) + + +def basic_memory_mock(task: str, workspace: str) -> None: + """ + This mock writes to a file (creates one if it doesn't exist) + """ + Challenge.write_to_file( + workspace, + "file_to_check.txt", + "2314", + ) diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index 8ef01d3c5..959dee361 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -56,8 +56,6 @@ def start(category: str, reg: bool, mock: bool) -> int: config = json.load(f) set_key(".env", "MOCK_TEST", "True" if mock else "False") - if mock: - config["workspace"] = "agbenchmark/mocks/workspace" # create workspace directory if it doesn't exist workspace_path = os.path.abspath(config["workspace"]) diff --git a/agbenchmark/tests/basic_abilities/read_file/artifacts/file_to_check.txt b/agbenchmark/tests/basic_abilities/read_file/artifacts/file_to_check.txt new file mode 100644 index 000000000..980a0d5f1 --- /dev/null +++ 
b/agbenchmark/tests/basic_abilities/read_file/artifacts/file_to_check.txt @@ -0,0 +1 @@ +Hello World! diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index b21e2724b..a74b875a8 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -4,8 +4,8 @@ "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", "dependencies": ["basic_write_file"], "ground": { - "answer": "random string: this is how we're doing", - "should_contain": ["random string: this is how we're doing"], + "answer": "random string: Hello World!", + "should_contain": ["random string: Hello World!"], "files": ["file_to_check.txt"] }, "mock": { diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index c5f886d52..e7f2af9ec 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -3,19 +3,12 @@ from typing import Any, Dict import pytest -from agbenchmark.challenge import Challenge from agbenchmark.tests.basic_abilities.basic_challenge import BasicChallenge class TestReadFile(BasicChallenge): """Testing if LLM can read a file""" - @pytest.fixture(scope="module", autouse=True) - def setup_module(self, workspace: str) -> None: - Challenge.write_to_file( - workspace, self.data.ground.files[0], "this is how we're doing" - ) - def get_file_path(self) -> str: # all tests must implement this method return os.path.join(os.path.dirname(__file__), "r_file_data.json") diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 966df7f2d..81f72cc9c 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -1,5 +1,4 @@ import os -from pathlib import Path from typing import Any, Dict import pytest @@ -17,8 +16,7 @@ class TestWriteFile(BasicChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - workspace = Path(os.getcwd()) / config["workspace"] - files_contents = self.open_files(workspace, self.data.ground.files) + files_contents = self.open_files(config["workspace"], self.data.ground.files) scores = [] for file_content in files_contents: diff --git a/regression_tests.json b/regression_tests.json index d0a8ed19d..cfa4bda38 100644 --- a/regression_tests.json +++ b/regression_tests.json @@ -1,4 +1,9 @@ { + "TestBasicMemory": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/memory/m1/m1_test.py" + }, "TestRetrieval": { "difficulty": "basic", "dependencies": [], @@ -9,6 +14,11 @@ "dependencies": [], "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py" }, + "TestRetrieval2": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/retrieval/r2/r2_test.py" + }, "TestReadFile": { "difficulty": "basic", "dependencies": [ @@ -16,14 +26,9 @@ ], "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py" }, - "TestRetrieval2": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/retrieval/r2/r2_test.py" - }, "TestRetrieval3": { "difficulty": "basic", "dependencies": [], "test": 
"agbenchmark/challenges/retrieval/r3/r3_test.py" } -} +} \ No newline at end of file -- cgit v1.2.3 From 7102fe1a182f3caed4f056600e9658d14031fe20 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Wed, 5 Jul 2023 21:03:45 -0700 Subject: Rename '--reg' flag to '--maintain' (#58) --- .github/workflows/ci.yml | 1 + agbenchmark/start_benchmark.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6fb348e9b..6a0f4503a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -106,3 +106,4 @@ jobs: run: | poetry install poetry run agbenchmark start --mock + poetry run agbenchmark start --mock --maintain diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index 959dee361..9c7b8e8da 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -25,9 +25,9 @@ def cli() -> None: @cli.command() @click.option("--category", default=None, help="Specific category to run") -@click.option("--reg", is_flag=True, help="Runs only regression tests") +@click.option("--maintain", is_flag=True, help="Runs only regression tests") @click.option("--mock", is_flag=True, help="Run with mock") -def start(category: str, reg: bool, mock: bool) -> int: +def start(category: str, maintain: bool, mock: bool) -> int: """Start the benchmark tests. If a category flag is provided, run the categories with that mark.""" # Check if configuration file exists and is not empty if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0: @@ -76,7 +76,7 @@ def start(category: str, reg: bool, mock: bool) -> int: if category: pytest_args.extend(["-m", category]) else: - if reg: + if maintain: print("Running all regression tests") tests_to_run = get_regression_tests() else: -- cgit v1.2.3 From e6f7bcf0ae6d115a8f0a7c35036792ac212ba9f9 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Thu, 6 Jul 2023 00:06:34 -0400 Subject: fixing --reg to --maintain workflow bug --- .github/workflows/autogpt.yml | 9 +++-- .github/workflows/gpt-engineer.yml | 8 ++--- .github/workflows/mini-agi.yml | 2 +- .github/workflows/smol-developer.yml | 8 ++--- .github/workflows/superagi.yml | 64 ++++++++++++++++++++++++++++++++++++ agent/SuperAGI | 1 + 6 files changed, 78 insertions(+), 14 deletions(-) create mode 100644 .github/workflows/superagi.yml create mode 160000 agent/SuperAGI diff --git a/.github/workflows/autogpt.yml b/.github/workflows/autogpt.yml index 4316f36ff..2d7e2dfbd 100644 --- a/.github/workflows/autogpt.yml +++ b/.github/workflows/autogpt.yml @@ -2,9 +2,9 @@ name: Auto-GPT Regression Test on: workflow_dispatch: - branches: [ master ] + branches: [master] push: - branches: [ stable, master, ci-test* ] + branches: [stable, master, ci-test*] jobs: regression-tests: @@ -15,7 +15,7 @@ jobs: timeout-minutes: 30 strategy: matrix: - python-version: ["3.10"] + python-version: ['3.10'] steps: - name: Checkout repository @@ -51,7 +51,6 @@ jobs: run: | poetry install --only main poetry build - - name: Run regression tests run: | @@ -60,6 +59,6 @@ jobs: cd agent/Auto-GPT pip install -r requirements.txt pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl - agbenchmark start --reg + agbenchmark start --maintain env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} diff --git a/.github/workflows/gpt-engineer.yml b/.github/workflows/gpt-engineer.yml index e0dbac2f0..a39165482 100644 --- a/.github/workflows/gpt-engineer.yml +++ b/.github/workflows/gpt-engineer.yml @@ -2,9 +2,9 @@ name: gpt-engineer Regression 
Test on: workflow_dispatch: - branches: [ master ] + branches: [master] push: - branches: [ stable, master, ci-test* ] + branches: [stable, master, ci-test*] jobs: regression-tests: @@ -15,7 +15,7 @@ jobs: timeout-minutes: 30 strategy: matrix: - python-version: ["3.10"] + python-version: ['3.10'] steps: - name: Checkout repository @@ -58,7 +58,7 @@ jobs: make install source venv/bin/activate pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl - agbenchmark start --reg + agbenchmark start --maintain env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} diff --git a/.github/workflows/mini-agi.yml b/.github/workflows/mini-agi.yml index 055df304d..53c479df4 100644 --- a/.github/workflows/mini-agi.yml +++ b/.github/workflows/mini-agi.yml @@ -60,7 +60,7 @@ jobs: pip install -r requirements.txt cp .env_example .env pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl - agbenchmark start --reg + agbenchmark start --maintain env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} PROMPT_USER: false diff --git a/.github/workflows/smol-developer.yml b/.github/workflows/smol-developer.yml index 13ee8cf8d..6926df54b 100644 --- a/.github/workflows/smol-developer.yml +++ b/.github/workflows/smol-developer.yml @@ -2,9 +2,9 @@ name: smol developer Regression Test on: workflow_dispatch: - branches: [ master ] + branches: [master] push: - branches: [ stable, master, ci-test* ] + branches: [stable, master, ci-test*] jobs: regression-tests: @@ -15,7 +15,7 @@ jobs: timeout-minutes: 30 strategy: matrix: - python-version: ["3.10"] + python-version: ['3.10'] steps: - name: Checkout repository @@ -59,6 +59,6 @@ jobs: source venv/bin/activate pip install -r requirements.txt pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl - agbenchmark start --reg + agbenchmark start --maintain env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} diff --git a/.github/workflows/superagi.yml b/.github/workflows/superagi.yml new file mode 100644 index 000000000..88176cdc9 --- /dev/null +++ b/.github/workflows/superagi.yml @@ -0,0 +1,64 @@ +name: SuperAgi Regression Test + +on: + workflow_dispatch: + branches: [master] + push: + branches: [stable, master, ci-test*] + +jobs: + regression-tests: + permissions: + pull-requests: write + contents: write + runs-on: ubuntu-latest + timeout-minutes: 30 + strategy: + matrix: + python-version: ['3.10'] + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 0 + ref: ${{ github.event.pull_request.head.ref }} + repository: ${{ github.event.pull_request.head.repo.full_name }} + submodules: true + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - id: get_date + name: Get date + run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT + + - name: Install Poetry + run: | + curl -sSL https://install.python-poetry.org | python - + + - name: Set up Poetry cache + uses: actions/cache@v2 + with: + path: | + ~/.cache/pypoetry + .venv + key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} + + - name: Set up venv and install Python dependencies + run: | + poetry install --only main + poetry build + + - name: Run regression tests + run: | + cd agent/SuperAgi + cp config_template.yaml config.yaml + sed -i 's/OPENAI_API_KEY:.*/OPENAI_API_KEY: "'"${{ secrets.OPENAI_API_KEY }}"'"/' config.yaml + python -m venv venv + source venv/bin/activate + pip install -r requirements.txt + pip install 
../../dist/agbenchmark-0.1.0-py3-none-any.whl + agbenchmark start --maintain diff --git a/agent/SuperAGI b/agent/SuperAGI new file mode 160000 index 000000000..166843799 --- /dev/null +++ b/agent/SuperAGI @@ -0,0 +1 @@ +Subproject commit 16684379930c770d3eb0ea00bd9f8d2630a1aa99 -- cgit v1.2.3 From de44d6ace51a229eff60d6d1965cdd18040e7d4d Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Thu, 6 Jul 2023 00:08:49 -0400 Subject: fix --- .github/workflows/superagi.yml | 64 ------------------------------------------ 1 file changed, 64 deletions(-) delete mode 100644 .github/workflows/superagi.yml diff --git a/.github/workflows/superagi.yml b/.github/workflows/superagi.yml deleted file mode 100644 index 88176cdc9..000000000 --- a/.github/workflows/superagi.yml +++ /dev/null @@ -1,64 +0,0 @@ -name: SuperAgi Regression Test - -on: - workflow_dispatch: - branches: [master] - push: - branches: [stable, master, ci-test*] - -jobs: - regression-tests: - permissions: - pull-requests: write - contents: write - runs-on: ubuntu-latest - timeout-minutes: 30 - strategy: - matrix: - python-version: ['3.10'] - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 0 - ref: ${{ github.event.pull_request.head.ref }} - repository: ${{ github.event.pull_request.head.repo.full_name }} - submodules: true - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - id: get_date - name: Get date - run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT - - - name: Install Poetry - run: | - curl -sSL https://install.python-poetry.org | python - - - - name: Set up Poetry cache - uses: actions/cache@v2 - with: - path: | - ~/.cache/pypoetry - .venv - key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} - - - name: Set up venv and install Python dependencies - run: | - poetry install --only main - poetry build - - - name: Run regression tests - run: | - cd agent/SuperAgi - cp config_template.yaml config.yaml - sed -i 's/OPENAI_API_KEY:.*/OPENAI_API_KEY: "'"${{ secrets.OPENAI_API_KEY }}"'"/' config.yaml - python -m venv venv - source venv/bin/activate - pip install -r requirements.txt - pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl - agbenchmark start --maintain -- cgit v1.2.3 From 4ebc5aa3b3b08bfa5710c9f2f8b28d737889c259 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Thu, 6 Jul 2023 00:14:40 -0400 Subject: submodule remove --- agent/SuperAGI | 1 - 1 file changed, 1 deletion(-) delete mode 160000 agent/SuperAGI diff --git a/agent/SuperAGI b/agent/SuperAGI deleted file mode 160000 index 166843799..000000000 --- a/agent/SuperAGI +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 16684379930c770d3eb0ea00bd9f8d2630a1aa99 -- cgit v1.2.3 From 5b19340f8e4cad6537d98b9a4d46e3635c762c1c Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Wed, 5 Jul 2023 21:35:15 -0700 Subject: Add 'Remember multiple ids' memory challenge (#59) --- .../memory/m2/artifacts/instructions_1.txt | 1 + .../memory/m2/artifacts/instructions_2.txt | 1 + .../memory/m2/artifacts/instructions_3.txt | 1 + .../memory/m2/artifacts/instructions_4.txt | 1 + .../memory/m2/artifacts/instructions_5.txt | 1 + .../memory/m2/remember_multiple_ids_data.json | 21 +++++++++++++++ .../memory/m2/remember_multiple_ids_test.py | 31 ++++++++++++++++++++++ agbenchmark/mocks/tests/basic_mocks.py | 11 ++++++++ regression_tests.json | 5 ++++ 9 files changed, 73 insertions(+) 
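The memory challenges in this commit chain instruction files together and expect the agent to carry the ids across reads. As a toy illustration only — the file names and the id phrasing are taken from the artifacts below, everything else is assumed — the chain can be walked like this:

```python
# Toy walker for the memory-challenge instruction chain. It collects every
# "id to remember" and follows each "Read the file ..." pointer in turn.
import re
from pathlib import Path
from typing import List, Optional


def follow_instructions(workspace: Path) -> List[str]:
    ids: List[str] = []
    current: Optional[str] = "instructions_1.txt"
    while current is not None:
        text = (workspace / current).read_text()
        ids.extend(re.findall(r"id to remember is (\d+)", text))
        match = re.search(r"Read the file (instructions_\d+\.txt)", text)
        current = match.group(1) if match else None
    return ids
```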
create mode 100644 agbenchmark/challenges/memory/m2/artifacts/instructions_1.txt create mode 100644 agbenchmark/challenges/memory/m2/artifacts/instructions_2.txt create mode 100644 agbenchmark/challenges/memory/m2/artifacts/instructions_3.txt create mode 100644 agbenchmark/challenges/memory/m2/artifacts/instructions_4.txt create mode 100644 agbenchmark/challenges/memory/m2/artifacts/instructions_5.txt create mode 100644 agbenchmark/challenges/memory/m2/remember_multiple_ids_data.json create mode 100644 agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py diff --git a/agbenchmark/challenges/memory/m2/artifacts/instructions_1.txt b/agbenchmark/challenges/memory/m2/artifacts/instructions_1.txt new file mode 100644 index 000000000..d304ce632 --- /dev/null +++ b/agbenchmark/challenges/memory/m2/artifacts/instructions_1.txt @@ -0,0 +1 @@ +The id to remember is 3145. Read the file instructions_2.txt. diff --git a/agbenchmark/challenges/memory/m2/artifacts/instructions_2.txt b/agbenchmark/challenges/memory/m2/artifacts/instructions_2.txt new file mode 100644 index 000000000..cfff66ba2 --- /dev/null +++ b/agbenchmark/challenges/memory/m2/artifacts/instructions_2.txt @@ -0,0 +1 @@ +The id to remember is 3791. Read the file instructions_3.txt. diff --git a/agbenchmark/challenges/memory/m2/artifacts/instructions_3.txt b/agbenchmark/challenges/memory/m2/artifacts/instructions_3.txt new file mode 100644 index 000000000..ad16b6fdc --- /dev/null +++ b/agbenchmark/challenges/memory/m2/artifacts/instructions_3.txt @@ -0,0 +1 @@ +The id to remember is 9317. Read the file instructions_4.txt. diff --git a/agbenchmark/challenges/memory/m2/artifacts/instructions_4.txt b/agbenchmark/challenges/memory/m2/artifacts/instructions_4.txt new file mode 100644 index 000000000..2394d64bc --- /dev/null +++ b/agbenchmark/challenges/memory/m2/artifacts/instructions_4.txt @@ -0,0 +1 @@ +The id to remember is 9471. Read the file instructions_5.txt. diff --git a/agbenchmark/challenges/memory/m2/artifacts/instructions_5.txt b/agbenchmark/challenges/memory/m2/artifacts/instructions_5.txt new file mode 100644 index 000000000..92a6fba94 --- /dev/null +++ b/agbenchmark/challenges/memory/m2/artifacts/instructions_5.txt @@ -0,0 +1 @@ +Write the ids previously mentioned to a file called file_to_check.txt. diff --git a/agbenchmark/challenges/memory/m2/remember_multiple_ids_data.json b/agbenchmark/challenges/memory/m2/remember_multiple_ids_data.json new file mode 100644 index 000000000..374df6165 --- /dev/null +++ b/agbenchmark/challenges/memory/m2/remember_multiple_ids_data.json @@ -0,0 +1,21 @@ +{ + "name": "remember_multiple_ids", + "category": ["memory"], + "task": "Follow the instructions in the instructions_1.txt file", + "dependencies": [], + "ground": { + "answer": "3145\n3791\n9317\n9471", + "should_contain": ["3145", "3791", "9317", "9471"], + "should_not_contain": [], + "files": ["file_to_check.txt"] + }, + "mock": { + "mock_func": "remember_multiple_ids_mock", + "mock_task": "Follow the instructions in the instructions_1.txt file" + }, + "info": { + "difficulty": "basic", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py b/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py new file mode 100644 index 000000000..d5f0cf1a9 --- /dev/null +++ b/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py @@ -0,0 +1,31 @@ +import os +from typing import Any, Dict + +import pytest + +from agbenchmark.challenges.memory.memory import MemoryChallenge + + +class TestRememberMultipleIds(MemoryChallenge): + """The first memory challenge""" + + def get_file_path(self) -> str: # all tests must implement this method + return os.path.join( + os.path.dirname(__file__), "remember_multiple_ids_data.json" + ) + + @pytest.mark.depends( + name="test_remember_multiple_ids", depends=["test_basic_memory"] + ) + def test_method(self, config: Dict[str, Any]) -> None: + self.setup_challenge(config) + + files_contents = self.open_files(config["workspace"], self.data.ground.files) + + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, self.data.ground) + print("Your score is:", score) + scores.append(score) + + assert 1 in scores diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index 3b9170f4e..1ffb3de39 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -66,3 +66,14 @@ def basic_memory_mock(task: str, workspace: str) -> None: "file_to_check.txt", "2314", ) + + +def remember_multiple_ids_mock(task: str, workspace: str) -> None: + """ + This mock writes to a file (creates one if it doesn't exist) + """ + Challenge.write_to_file( + workspace, + "file_to_check.txt", + "3145\n3791\n9317\n9471", + ) diff --git a/regression_tests.json b/regression_tests.json index cfa4bda38..9742aa47e 100644 --- a/regression_tests.json +++ b/regression_tests.json @@ -30,5 +30,10 @@ "difficulty": "basic", "dependencies": [], "test": "agbenchmark/challenges/retrieval/r3/r3_test.py" + }, + "TestRememberMultipleIds": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py" } } \ No newline at end of file -- cgit v1.2.3 From c76062b0924543e70feb0d6b621cf642c987df51 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Thu, 6 Jul 2023 00:38:01 -0400 Subject: Added caching based on file key (#62) Co-authored-by: merwanehamadi --- agbenchmark/challenge.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py index 865d64444..dee2b435e 100644 --- a/agbenchmark/challenge.py +++ b/agbenchmark/challenge.py @@ -20,6 +20,8 @@ class Challenge(ABC): """The parent class to all specific challenges classes. 
Defines helper methods for running a challenge""" + _data_cache: Dict[str, ChallengeData] = {} + @abstractmethod def get_file_path(self) -> str: """This should be implemented by any class which inherits from BasicChallenge""" @@ -27,8 +29,13 @@ class Challenge(ABC): @property def data(self) -> ChallengeData: - # TODO: make it so that this is cached somewhere to just call self.deserialized_data - return ChallengeData.deserialize(self.get_file_path()) + "Check if the data is already loaded, if not load it" + file_path = ( + self.get_file_path() + ) # file_path serves as the key in the cache dictionary + if file_path not in Challenge._data_cache: + Challenge._data_cache[file_path] = ChallengeData.deserialize(file_path) + return Challenge._data_cache[file_path] @property def mock(self) -> Optional[str]: -- cgit v1.2.3 From 82d8f67f6ab62989469205230ac7a6668e3c7407 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Wed, 5 Jul 2023 22:34:51 -0700 Subject: Add 'remember ids with noise' challenge (#61) --- .../memory/m3/artifacts/instructions_1.txt | 5 ++++ .../memory/m3/artifacts/instructions_2.txt | 5 ++++ .../memory/m3/artifacts/instructions_3.txt | 5 ++++ .../memory/m3/artifacts/instructions_4.txt | 5 ++++ .../memory/m3/artifacts/instructions_5.txt | 5 ++++ .../m3/remember_multiple_ids_with_noise_data.json | 21 ++++++++++++++ .../m3/remember_multiple_ids_with_noise_test.py | 32 ++++++++++++++++++++++ regression_tests.json | 25 ++++++++++------- 8 files changed, 93 insertions(+), 10 deletions(-) create mode 100644 agbenchmark/challenges/memory/m3/artifacts/instructions_1.txt create mode 100644 agbenchmark/challenges/memory/m3/artifacts/instructions_2.txt create mode 100644 agbenchmark/challenges/memory/m3/artifacts/instructions_3.txt create mode 100644 agbenchmark/challenges/memory/m3/artifacts/instructions_4.txt create mode 100644 agbenchmark/challenges/memory/m3/artifacts/instructions_5.txt create mode 100644 agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_data.json create mode 100644 agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py diff --git a/agbenchmark/challenges/memory/m3/artifacts/instructions_1.txt b/agbenchmark/challenges/memory/m3/artifacts/instructions_1.txt new file mode 100644 index 000000000..8deee1b09 --- /dev/null +++ b/agbenchmark/challenges/memory/m3/artifacts/instructions_1.txt @@ -0,0 +1,5 @@ +xBd9cZTfaExYalwtUVR7m0pe3Nfaf5uBS4IxGFZPZcQjDf8Tfk2vdpqWI0ESBWCdVzsdlxMmUxq43INSz1iftsv6PTOMGQ88Cojwj5mQXp8XKZ6VJC893BDLVLLW00iQy8VerWjQay9rAJz8rYVZHa6dByYNWZNOgtjC7ejnKt0VYZtUFfRBZNWG2HNX3tgX8H2h4xeu7GIfm4wETGvzlZDANeqiY2hMCXEAsliVXXjgmvVeB05tjkS5uvq5uV2DnNyStimIVVdEMFI5Ft0qM82aMvJlUtVj6TJEmE0qPTqBXeHud72iRTcBa9okCzjYiAd6oSoJ8k9o6lmFTeq323ILYCGzsICjqoysuVonfHUDh1Ll2LTo4I2AygfPqCqvgQWq9wa8YfWKBlwPPVy2lymJRTd1mS7RUaiagoNn76ApJviCYh2fWEZcxULCKAbbn0E6vz1CBADSOEIVB14ZyyRfiDcXbgYYcnOShwMsg0vYcKDKfAHk + +The id to remember is 3145. Read the file instructions_2.txt. 
+ +OueiZyOoM5eGI5VkTt6Ue1XboZ4jztAa5IGWqSbhIhLiI4X2nOmJw6tLBd3smZNwKQpq8NHxZSk76Xd82yGI3l9KhLARXRT37MRkY7WG8YQVJEurki86cIGuXthSWjq9dDKh6ILSq4DdWPIHfDeV12psErCcgEUxWtoU6bnSMnaoYteOkKWTAkXdC1t4j5p3rXbMv1j92nLmCmoslT2A9noQIODWLdudtCecTMmrbq85RLBt5WFLGMfWVsuSrSMGo5tiN7vC1siLfhlhco0q5QaMv0px6kVg44Wceg3UXIUoMxTNoh9G8uEABJhvsF2qzxkbHuhk6VRuydIWoGgfN01upk6BDfvooyAkdcTJG5jFlHOJixTe4ramT5uP54oZ0anJTB6w7hybN3o9vb4xLbAFQxCZIXZ9HXgeBnl1L8qIvQg9VsklntCMsu2cm5CgIryRBGPqnTAbrhmAmFOkNyLSVFfYmu2wtlMov2NIkYilT4Oa1Rkt diff --git a/agbenchmark/challenges/memory/m3/artifacts/instructions_2.txt b/agbenchmark/challenges/memory/m3/artifacts/instructions_2.txt new file mode 100644 index 000000000..c9b5fc5f9 --- /dev/null +++ b/agbenchmark/challenges/memory/m3/artifacts/instructions_2.txt @@ -0,0 +1,5 @@ +2yXfw2vPZCZq4jGOTHF4NEUYLbAUBIcmkgLxG7qXnYLNsvvZDqAvBPw4OcOfleIWvS6S5GThSPcrSfX94yB1TT6SVHGqPkulJUk4W1wfIFRIiOSps6V8ulLyrmeZsEJ6l9B9Vrm4h6SZTQVP750TUfECOH4d5j5AtupugjqThyw3t6ZFYHr2eUYRiOiTlng2uvsoZiioBQlUitrjQ4mw8FRL3VaR2aAhHGwaNV0Q7XelFU50YQgcUYqfxHxmqCLqb7dtZ7WWwxrLcqwVbj4y1YteFzPZyU4TJKopMVizgWaam8tKa1hYAQHqEaiAAHigqvYhutPHarpzc4PP2RLE4AZCxRblSY40iXpxQ9waXsrUEZ51ZRFmvm5G17wuKghMcKea2jN2MIgvSxNBy7cszFyBTe6V6u6IMk1wVWa0YulPslLc0bXUVKqZ54b61lyBAKSoFbJVRFYB5XZBL5tp2efvTsEQ3JvFmSREEOhmawIriifCApy1 + +The id to remember is 3791. Read the file instructions_3.txt. + +BDLfeJBcfS4iqE9sNAm4ndZT2F1fsFYdXGRpRQ6xSXl014c9H7NeMbQCtFb7kRtVvzx9AItPj1uqtjA0R35N2Pj8FgxfSPDb8dlizLA6dbKY4JfCWmibzrBYoFzoxiPX57m3n8yLKHA0aejG38aMJ6XjR75kAjBW0Cw9d3Ny0MphakfW8KDZoMO3qwsPLLASYrz42K7JjThVGZvEXczRBY8la4NJPZpj91GmfsQuJezCvcI87gjfjtRDp1GECU9SmLSWBufjQWWlc4p6z5XtPPu0vqxRjoiFDFZvafU35KkEDcWAHv3KhR0Z20JD2qIrJ4CHntwGBAk61nMBpKhNx0t3ONK5X0WD7gNCdG64obji2ifsI8ZydLkROJkAJCpe4zRd04mkydCwKGJzmCGv0lu1KRn4QobFq7mEeuzD0xvvGtyiuiVXJSVqphf5ySmfjD4EvDCMRDNZx7c4pECUnLBPDlB8JwMyugfyD5mslte9YCG9kK6n diff --git a/agbenchmark/challenges/memory/m3/artifacts/instructions_3.txt b/agbenchmark/challenges/memory/m3/artifacts/instructions_3.txt new file mode 100644 index 000000000..a564a7c91 --- /dev/null +++ b/agbenchmark/challenges/memory/m3/artifacts/instructions_3.txt @@ -0,0 +1,5 @@ +ccgfgLxgQxZ1F3USqH9STHNVYAV3AnxU97C77kgsSkx51Czq0cLp4ZmedPY3dVNUtm2kR9HDahe7Cp6zc3zvpK9RlRKQ1rugGDkNbgc4H57MYOQpDKzbikvHWXtxFixXqNiuRJW8h8uAAKv7qsStgJ2loyM9zKVicCdHerR2Q3kT0No2DOxHUv8T1d7rN3Kf0Sh6nz2LOSqvLjtzTAxpkM7ehUzFuOoXjQnuVUHqXTzL26ASeMd74N5DgO81LofDb6mcvrmVMIY8tfDBqHV83a8zn9HySeg5ZapnAVI1KOXRXyZEgbsVe1rEy8slCXqEuYCqn8IudJBnRDNKnZRdhwQ8tCpXQ1D3quhLEJxjhpRFiLl7lEZvtkunL2UlDkErcFjfKfVJLNy2loQsB9aKbR3NXBGbQNXR9QQ2xyGJPl8Wnw03sCRjubn83LpMoC2jirhJld5VFTffPY3jw0MyX3vUF8kfHRGowsrzzcdNuESiRf3iMcmx + +The id to remember is 9317. Read the file instructions_4.txt. 
+ +8YwXVp6zAoxCfth85iCMA2BrfDW8JjjzieFnuj7GOhW8BlbTsymfiA59qTBCuThFk0wKlT4MDskG5dU7dMHuEAKslHwcfZywvUZYFacVgSn3VfQeWBUgXE7rrrYRclqpjI3nqYuqfVzNHxn0iG13j6q5SAauFSaP2x0CWdDnEGqi9Dh7DcvjVIzlXn3GLa1BTd0qceGYCJRT7UM9YTHJYHo7Ipr2Hgreawb09Xx0KovgzBSgJYpqRNwVRFTfpgvZZIEA0QDPZE28n8KgtDPHHAIroZP6KijKCjyNLjYyM0D4AtiuLDon8bcaLRhzhC6plMvI7pkpEpsct3u9zg8C8V0pONWpWNNCompsuwl4TECiVtVygrYNcroROjNs1CGfliHCvGBbADSQORrFbhRyYEhIVNgXQsh4ilpcOYLAZoxsYlFtJw5Mf7y8ebhay1uJfNaolFWD63QID5UHBqzXZUDupjtUhD0N4XEtNeVyTqXD4S5SffdQ diff --git a/agbenchmark/challenges/memory/m3/artifacts/instructions_4.txt b/agbenchmark/challenges/memory/m3/artifacts/instructions_4.txt new file mode 100644 index 000000000..63fcd54fa --- /dev/null +++ b/agbenchmark/challenges/memory/m3/artifacts/instructions_4.txt @@ -0,0 +1,5 @@ +RpsPog8gFZhpbMDHImq5OBbzE9DwrzKaqldc7UVs8FXohjoAaLb7sYW291NVDdUUk4AYwHum0VotgPPdQp3wxGza9Se6pex9G6IDd1jxPW5UTnik0Ygns0U5m5uCvuF140ozvPBj5blJ5wWIiQOXceyl7RsXIP82w1u0yTMCmAdefJISC9HAsJKg2QBbtCNoGIHEIuCZFSdskBe9XaPxPoA40AD4UXpwKIE3qEui39QyePL1czeqSGGmmRgziDiMhFiSwNarA0rwehZPW3QKpACjRu9wcVoTg7wxT4R8bziUzwVDeI5ZeqNqNrTUYiLpfUEbckcRQD3FUALmEtzIdSMFdD9z8CiSXVEAOpUbiqm9itNBnMVaZQ2eyKxet4EvZrfUbkhGntIxf8VOsgXlkzb0iXcyHu1PogCjOkSWodlPfN4mYm0P4EvUGogI3suX3tCXp5EJiZHSIoyApxme6Nd6NkdbIjcQOep42z6z23cO5uPv41cA + +The id to remember is 9471. Read the file instructions_5.txt. + +1d2q36DxTTRcg8ijubOcPntReHzsM94G2O0xmclXuqh8kqVaYcNdl7mmQWNSSh4OEF6BUuhVg0s58UFeJnLNOzYeC8T0fYhhrvkw8enra8JS2VeQCnTxdn4GUhJXu4V0UOhaeNnArmO03M8n6mQbV77VnH5RSVR4qVWnYmHGaswdHxYOZVnxFSyzzmhdX1dK5q9EgIwadglyBkpKu15ZaMIH2SOPjil8YjD4euRZc2EGZDcvCkJAJeLsrbTQLM82k2fKqRFyuNELCQp1DRz6gExByQ11SbHRqednp6ndj8msLC116PVMqp8pSDVZ1RrTFV9UOReeO9wnveJbLUEMh8c3wlg6yXn0XrbO7qabvdKXAjIIF8JN9HdLoFGdtDPLJbHDyJPqAYt9OIfyr8knQFLJZjKCBn8QrYy1jdFIHnENf1vZTI1bA4SUOG81xeRHGDVln9mcjdlBHdWRHX9TPbaokSjl9q7nDk6734KpPullyvm10l1M diff --git a/agbenchmark/challenges/memory/m3/artifacts/instructions_5.txt b/agbenchmark/challenges/memory/m3/artifacts/instructions_5.txt new file mode 100644 index 000000000..1c3f156e3 --- /dev/null +++ b/agbenchmark/challenges/memory/m3/artifacts/instructions_5.txt @@ -0,0 +1,5 @@ +sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232 + +Write the ids previously mentioned to a file called file_to_check.txt. 
+
+gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0
diff --git a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_data.json b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_data.json
new file mode 100644
index 000000000..95c93ccb9
--- /dev/null
+++ b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_data.json
@@ -0,0 +1,21 @@
+{
+  "name": "remember_multiple_ids_with_noise_mock",
+  "category": ["memory"],
+  "task": "Follow the instructions in the instructions_1.txt file",
+  "dependencies": [],
+  "ground": {
+    "answer": "3145\n3791\n9317\n9471",
+    "should_contain": ["3145", "3791", "9317", "9471"],
+    "should_not_contain": [],
+    "files": ["file_to_check.txt"]
+  },
+  "mock": {
+    "mock_func": "remember_multiple_ids_mock",
+    "mock_task": "Follow the instructions in the instructions_1.txt file"
+  },
+  "info": {
+    "difficulty": "medium",
+    "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
+    "side_effects": ["tests if there is in fact an LLM attached"]
+  }
+}
diff --git a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py
new file mode 100644
index 000000000..4d2d64957
--- /dev/null
+++ b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py
@@ -0,0 +1,32 @@
+import os
+from typing import Any, Dict
+
+import pytest
+
+from agbenchmark.challenges.memory.memory import MemoryChallenge
+
+
+class TestRememberMultipleIdsWithNoise(MemoryChallenge):
+    """The third memory challenge"""
+
+    def get_file_path(self) -> str:  # all tests must implement this method
+        return os.path.join(
+            os.path.dirname(__file__), "remember_multiple_ids_with_noise_data.json"
+        )
+
+    @pytest.mark.depends(
+        name="test_remember_multiple_ids_with_noise",
+        depends=["test_remember_multiple_ids"],
+    )
+    def test_method(self, config: Dict[str, Any]) -> None:
+        self.setup_challenge(config)
+
+        files_contents = self.open_files(config["workspace"], self.data.ground.files)
+
+        scores = []
+        for file_content in files_contents:
+            score = self.scoring(file_content, self.data.ground)
+            print("Your score is:", score)
+            scores.append(score)
+
+        assert 1 in scores
diff --git a/regression_tests.json b/regression_tests.json
index 9742aa47e..3b91a5c90 100644
--- a/regression_tests.json
+++ b/regression_tests.json
@@ -4,11 +4,6 @@
     "dependencies": [],
     "test": "agbenchmark/challenges/memory/m1/m1_test.py"
   },
-  "TestRetrieval": {
-    "difficulty": "basic",
-    "dependencies": [],
-    "test": "agbenchmark/challenges/retrieval/r1/r1_test.py"
-  },
   "TestWriteFile": {
     "difficulty": "basic",
     "dependencies": [],
     "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py"
   },
@@ -19,6 +14,16 @@
     "dependencies": [],
     "test": "agbenchmark/challenges/retrieval/r2/r2_test.py"
   },
+  "TestRetrieval3": {
+    "difficulty": "basic",
+    "dependencies": [],
+    "test": 
"agbenchmark/challenges/retrieval/r3/r3_test.py" + }, + "TestRetrieval": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/retrieval/r1/r1_test.py" + }, "TestReadFile": { "difficulty": "basic", "dependencies": [ @@ -26,14 +31,14 @@ ], "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py" }, - "TestRetrieval3": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/retrieval/r3/r3_test.py" - }, "TestRememberMultipleIds": { "difficulty": "basic", "dependencies": [], "test": "agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py" + }, + "TestRememberMultipleIdsWithNoise": { + "difficulty": "medium", + "dependencies": [], + "test": "agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py" } } \ No newline at end of file -- cgit v1.2.3 From 0b4ae5ea78cc10506cfea863ff0cd9bea4f3575e Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Thu, 6 Jul 2023 14:19:12 -0700 Subject: Add 'remember phrases with noise' challenge (#63) --- .../memory/m4/artifacts/instructions_1.txt | 5 ++++ .../memory/m4/artifacts/instructions_2.txt | 5 ++++ .../memory/m4/artifacts/instructions_3.txt | 5 ++++ .../memory/m4/artifacts/instructions_4.txt | 5 ++++ .../memory/m4/artifacts/instructions_5.txt | 5 ++++ .../remember_multiple_phrases_with_noise_data.json | 26 ++++++++++++++++++ .../remember_multiple_phrases_with_noise_test.py | 32 ++++++++++++++++++++++ agbenchmark/mocks/tests/basic_mocks.py | 11 ++++++++ regression_tests.json | 5 ++++ 9 files changed, 99 insertions(+) create mode 100644 agbenchmark/challenges/memory/m4/artifacts/instructions_1.txt create mode 100644 agbenchmark/challenges/memory/m4/artifacts/instructions_2.txt create mode 100644 agbenchmark/challenges/memory/m4/artifacts/instructions_3.txt create mode 100644 agbenchmark/challenges/memory/m4/artifacts/instructions_4.txt create mode 100644 agbenchmark/challenges/memory/m4/artifacts/instructions_5.txt create mode 100644 agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json create mode 100644 agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_1.txt b/agbenchmark/challenges/memory/m4/artifacts/instructions_1.txt new file mode 100644 index 000000000..1b1e0147c --- /dev/null +++ b/agbenchmark/challenges/memory/m4/artifacts/instructions_1.txt @@ -0,0 +1,5 @@ +xBd9cZTfaExYalwtUVR7m0pe3Nfaf5uBS4IxGFZPZcQjDf8Tfk2vdpqWI0ESBWCdVzsdlxMmUxq43INSz1iftsv6PTOMGQ88Cojwj5mQXp8XKZ6VJC893BDLVLLW00iQy8VerWjQay9rAJz8rYVZHa6dByYNWZNOgtjC7ejnKt0VYZtUFfRBZNWG2HNX3tgX8H2h4xeu7GIfm4wETGvzlZDANeqiY2hMCXEAsliVXXjgmvVeB05tjkS5uvq5uV2DnNyStimIVVdEMFI5Ft0qM82aMvJlUtVj6TJEmE0qPTqBXeHud72iRTcBa9okCzjYiAd6oSoJ8k9o6lmFTeq323ILYCGzsICjqoysuVonfHUDh1Ll2LTo4I2AygfPqCqvgQWq9wa8YfWKBlwPPVy2lymJRTd1mS7RUaiagoNn76ApJviCYh2fWEZcxULCKAbbn0E6vz1CBADSOEIVB14ZyyRfiDcXbgYYcnOShwMsg0vYcKDKfAHk + +The phrase to remember is 'The purple elephant danced on a rainbow while eating a taco'. Read the file instructions_2.txt. 
+ +OueiZyOoM5eGI5VkTt6Ue1XboZ4jztAa5IGWqSbhIhLiI4X2nOmJw6tLBd3smZNwKQpq8NHxZSk76Xd82yGI3l9KhLARXRT37MRkY7WG8YQVJEurki86cIGuXthSWjq9dDKh6ILSq4DdWPIHfDeV12psErCcgEUxWtoU6bnSMnaoYteOkKWTAkXdC1t4j5p3rXbMv1j92nLmCmoslT2A9noQIODWLdudtCecTMmrbq85RLBt5WFLGMfWVsuSrSMGo5tiN7vC1siLfhlhco0q5QaMv0px6kVg44Wceg3UXIUoMxTNoh9G8uEABJhvsF2qzxkbHuhk6VRuydIWoGgfN01upk6BDfvooyAkdcTJG5jFlHOJixTe4ramT5uP54oZ0anJTB6w7hybN3o9vb4xLbAFQxCZIXZ9HXgeBnl1L8qIvQg9VsklntCMsu2cm5CgIryRBGPqnTAbrhmAmFOkNyLSVFfYmu2wtlMov2NIkYilT4Oa1Rkt diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_2.txt b/agbenchmark/challenges/memory/m4/artifacts/instructions_2.txt new file mode 100644 index 000000000..92203f5c1 --- /dev/null +++ b/agbenchmark/challenges/memory/m4/artifacts/instructions_2.txt @@ -0,0 +1,5 @@ +2yXfw2vPZCZq4jGOTHF4NEUYLbAUBIcmkgLxG7qXnYLNsvvZDqAvBPw4OcOfleIWvS6S5GThSPcrSfX94yB1TT6SVHGqPkulJUk4W1wfIFRIiOSps6V8ulLyrmeZsEJ6l9B9Vrm4h6SZTQVP750TUfECOH4d5j5AtupugjqThyw3t6ZFYHr2eUYRiOiTlng2uvsoZiioBQlUitrjQ4mw8FRL3VaR2aAhHGwaNV0Q7XelFU50YQgcUYqfxHxmqCLqb7dtZ7WWwxrLcqwVbj4y1YteFzPZyU4TJKopMVizgWaam8tKa1hYAQHqEaiAAHigqvYhutPHarpzc4PP2RLE4AZCxRblSY40iXpxQ9waXsrUEZ51ZRFmvm5G17wuKghMcKea2jN2MIgvSxNBy7cszFyBTe6V6u6IMk1wVWa0YulPslLc0bXUVKqZ54b61lyBAKSoFbJVRFYB5XZBL5tp2efvTsEQ3JvFmSREEOhmawIriifCApy1 + +The phrase to remember is 'The sneaky toaster stole my socks and ran away to Hawaii'. Read the file instructions_3.txt. + +BDLfeJBcfS4iqE9sNAm4ndZT2F1fsFYdXGRpRQ6xSXl014c9H7NeMbQCtFb7kRtVvzx9AItPj1uqtjA0R35N2Pj8FgxfSPDb8dlizLA6dbKY4JfCWmibzrBYoFzoxiPX57m3n8yLKHA0aejG38aMJ6XjR75kAjBW0Cw9d3Ny0MphakfW8KDZoMO3qwsPLLASYrz42K7JjThVGZvEXczRBY8la4NJPZpj91GmfsQuJezCvcI87gjfjtRDp1GECU9SmLSWBufjQWWlc4p6z5XtPPu0vqxRjoiFDFZvafU35KkEDcWAHv3KhR0Z20JD2qIrJ4CHntwGBAk61nMBpKhNx0t3ONK5X0WD7gNCdG64obji2ifsI8ZydLkROJkAJCpe4zRd04mkydCwKGJzmCGv0lu1KRn4QobFq7mEeuzD0xvvGtyiuiVXJSVqphf5ySmfjD4EvDCMRDNZx7c4pECUnLBPDlB8JwMyugfyD5mslte9YCG9kK6n diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_3.txt b/agbenchmark/challenges/memory/m4/artifacts/instructions_3.txt new file mode 100644 index 000000000..d7f6f08fc --- /dev/null +++ b/agbenchmark/challenges/memory/m4/artifacts/instructions_3.txt @@ -0,0 +1,5 @@ +ccgfgLxgQxZ1F3USqH9STHNVYAV3AnxU97C77kgsSkx51Czq0cLp4ZmedPY3dVNUtm2kR9HDahe7Cp6zc3zvpK9RlRKQ1rugGDkNbgc4H57MYOQpDKzbikvHWXtxFixXqNiuRJW8h8uAAKv7qsStgJ2loyM9zKVicCdHerR2Q3kT0No2DOxHUv8T1d7rN3Kf0Sh6nz2LOSqvLjtzTAxpkM7ehUzFuOoXjQnuVUHqXTzL26ASeMd74N5DgO81LofDb6mcvrmVMIY8tfDBqHV83a8zn9HySeg5ZapnAVI1KOXRXyZEgbsVe1rEy8slCXqEuYCqn8IudJBnRDNKnZRdhwQ8tCpXQ1D3quhLEJxjhpRFiLl7lEZvtkunL2UlDkErcFjfKfVJLNy2loQsB9aKbR3NXBGbQNXR9QQ2xyGJPl8Wnw03sCRjubn83LpMoC2jirhJld5VFTffPY3jw0MyX3vUF8kfHRGowsrzzcdNuESiRf3iMcmx + +The phrase to remember is 'My pet rock sings better than Beyoncé on Tuesdays'. Read the file instructions_4.txt. 
+ +8YwXVp6zAoxCfth85iCMA2BrfDW8JjjzieFnuj7GOhW8BlbTsymfiA59qTBCuThFk0wKlT4MDskG5dU7dMHuEAKslHwcfZywvUZYFacVgSn3VfQeWBUgXE7rrrYRclqpjI3nqYuqfVzNHxn0iG13j6q5SAauFSaP2x0CWdDnEGqi9Dh7DcvjVIzlXn3GLa1BTd0qceGYCJRT7UM9YTHJYHo7Ipr2Hgreawb09Xx0KovgzBSgJYpqRNwVRFTfpgvZZIEA0QDPZE28n8KgtDPHHAIroZP6KijKCjyNLjYyM0D4AtiuLDon8bcaLRhzhC6plMvI7pkpEpsct3u9zg8C8V0pONWpWNNCompsuwl4TECiVtVygrYNcroROjNs1CGfliHCvGBbADSQORrFbhRyYEhIVNgXQsh4ilpcOYLAZoxsYlFtJw5Mf7y8ebhay1uJfNaolFWD63QID5UHBqzXZUDupjtUhD0N4XEtNeVyTqXD4S5SffdQ diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_4.txt b/agbenchmark/challenges/memory/m4/artifacts/instructions_4.txt new file mode 100644 index 000000000..6c0a13bae --- /dev/null +++ b/agbenchmark/challenges/memory/m4/artifacts/instructions_4.txt @@ -0,0 +1,5 @@ +RpsPog8gFZhpbMDHImq5OBbzE9DwrzKaqldc7UVs8FXohjoAaLb7sYW291NVDdUUk4AYwHum0VotgPPdQp3wxGza9Se6pex9G6IDd1jxPW5UTnik0Ygns0U5m5uCvuF140ozvPBj5blJ5wWIiQOXceyl7RsXIP82w1u0yTMCmAdefJISC9HAsJKg2QBbtCNoGIHEIuCZFSdskBe9XaPxPoA40AD4UXpwKIE3qEui39QyePL1czeqSGGmmRgziDiMhFiSwNarA0rwehZPW3QKpACjRu9wcVoTg7wxT4R8bziUzwVDeI5ZeqNqNrTUYiLpfUEbckcRQD3FUALmEtzIdSMFdD9z8CiSXVEAOpUbiqm9itNBnMVaZQ2eyKxet4EvZrfUbkhGntIxf8VOsgXlkzb0iXcyHu1PogCjOkSWodlPfN4mYm0P4EvUGogI3suX3tCXp5EJiZHSIoyApxme6Nd6NkdbIjcQOep42z6z23cO5uPv41cA + +The phrase to remember is 'The giant hamster rode a unicycle through the crowded mall'. Read the file instructions_5.txt. + +1d2q36DxTTRcg8ijubOcPntReHzsM94G2O0xmclXuqh8kqVaYcNdl7mmQWNSSh4OEF6BUuhVg0s58UFeJnLNOzYeC8T0fYhhrvkw8enra8JS2VeQCnTxdn4GUhJXu4V0UOhaeNnArmO03M8n6mQbV77VnH5RSVR4qVWnYmHGaswdHxYOZVnxFSyzzmhdX1dK5q9EgIwadglyBkpKu15ZaMIH2SOPjil8YjD4euRZc2EGZDcvCkJAJeLsrbTQLM82k2fKqRFyuNELCQp1DRz6gExByQ11SbHRqednp6ndj8msLC116PVMqp8pSDVZ1RrTFV9UOReeO9wnveJbLUEMh8c3wlg6yXn0XrbO7qabvdKXAjIIF8JN9HdLoFGdtDPLJbHDyJPqAYt9OIfyr8knQFLJZjKCBn8QrYy1jdFIHnENf1vZTI1bA4SUOG81xeRHGDVln9mcjdlBHdWRHX9TPbaokSjl9q7nDk6734KpPullyvm10l1M diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_5.txt b/agbenchmark/challenges/memory/m4/artifacts/instructions_5.txt new file mode 100644 index 000000000..729f0aa0d --- /dev/null +++ b/agbenchmark/challenges/memory/m4/artifacts/instructions_5.txt @@ -0,0 +1,5 @@ +sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232 + +Write the phrases previously mentioned to a file called file_to_check.txt. 
+ +gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0 diff --git a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json new file mode 100644 index 000000000..e1ecb16f4 --- /dev/null +++ b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json @@ -0,0 +1,26 @@ +{ + "name": "remember_multiple_phrases_with_noise_mock", + "category": ["memory"], + "task": "Follow the instructions in the instructions_1.txt file", + "dependencies": [], + "ground": { + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyoncé on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "should_contain": [ + "The purple elephant danced on a rainbow while eating a taco", + "The sneaky toaster stole my socks and ran away to Hawaii", + "My pet rock sings better than Beyoncé on Tuesdays", + "The giant hamster rode a unicycle through the crowded mall" + ], + "should_not_contain": [], + "files": ["file_to_check.txt"] + }, + "mock": { + "mock_func": "remember_multiple_phrases_with_noise_mock", + "mock_task": "Follow the instructions in the instructions_1.txt file" + }, + "info": { + "difficulty": "medium", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.",
+    "side_effects": ["tests if there is in fact an LLM attached"]
+  }
+}
diff --git a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py
new file mode 100644
index 000000000..fd33da1c0
--- /dev/null
+++ b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py
@@ -0,0 +1,32 @@
+import os
+from typing import Any, Dict
+
+import pytest
+
+from agbenchmark.challenges.memory.memory import MemoryChallenge
+
+
+class TestRememberMultiplePhrasesWithNoise(MemoryChallenge):
+    """The fourth memory challenge"""
+
+    def get_file_path(self) -> str:  # all tests must implement this method
+        return os.path.join(
+            os.path.dirname(__file__), "remember_multiple_phrases_with_noise_data.json"
+        )
+
+    @pytest.mark.depends(
+        name="test_remember_multiple_phrases_with_noise",
+        depends=["test_remember_multiple_ids_with_noise"],
+    )
+    def test_method(self, config: Dict[str, Any]) -> None:
+        self.setup_challenge(config)
+
+        files_contents = self.open_files(config["workspace"], self.data.ground.files)
+
+        scores = []
+        for file_content in files_contents:
+            score = self.scoring(file_content, self.data.ground)
+            print("Your score is:", score)
+            scores.append(score)
+
+        assert 1 in scores
diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py
index 1ffb3de39..37ded0ae9 100644
--- a/agbenchmark/mocks/tests/basic_mocks.py
+++ b/agbenchmark/mocks/tests/basic_mocks.py
@@ -77,3 +77,14 @@ def remember_multiple_ids_mock(task: str, workspace: str) -> None:
         "file_to_check.txt",
         "3145\n3791\n9317\n9471",
     )
+
+
+def remember_multiple_phrases_with_noise_mock(task: str, workspace: str) -> None:
+    """
+    This mock writes to a file (creates one if it doesn't exist)
+    """
+    Challenge.write_to_file(
+        workspace,
+        "file_to_check.txt",
+        "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyoncé on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
+    )
diff --git a/regression_tests.json b/regression_tests.json
index 3b91a5c90..1195efbc9 100644
--- a/regression_tests.json
+++ b/regression_tests.json
@@ -40,5 +40,10 @@
     "difficulty": "medium",
     "dependencies": [],
     "test": "agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py"
+  },
+  "TestRememberMultiplePhrasesWithNoise": {
+    "difficulty": "medium",
+    "dependencies": [],
+    "test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py"
+  }
 }
\ No newline at end of file
-- 
cgit v1.2.3


From bfd0d5c826b3854c25b9db1f548315c74592b68d Mon Sep 17 00:00:00 2001
From: Silen Naihin
Date: Thu, 6 Jul 2023 21:00:45 -0400
Subject: Fix home_path, local mini-agi run works (#64)

Co-authored-by: merwanehamadi
---
 agbenchmark/agent_interface.py | 4 ++--
 agbenchmark/start_benchmark.py | 5 -----
 config.json                    | 4 ++--
 pyproject.toml                 | 1 +
 4 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py
index 4d74aac73..8e9e5a14c 100644
--- a/agbenchmark/agent_interface.py
+++ b/agbenchmark/agent_interface.py
@@ -33,11 +33,10 @@ def run_agent(
     )
 
     # Get the current working directory
-    cwd = os.getcwd()
+    cwd = os.path.join(os.getcwd(), config["home_path"])
 
     # Add current directory to Python's import path
     sys.path.append(cwd)
-
sys.path.append(os.path.join(cwd, config["home_path"])) command = [sys.executable, config["entry_path"], str(task)] process = subprocess.Popen( @@ -67,6 +66,7 @@ def run_agent( print( "The Python function has exceeded the time limit and was terminated." ) + # Terminate the process group process.terminate() break diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index 9c7b8e8da..295bbf4bf 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -57,11 +57,6 @@ def start(category: str, maintain: bool, mock: bool) -> int: set_key(".env", "MOCK_TEST", "True" if mock else "False") - # create workspace directory if it doesn't exist - workspace_path = os.path.abspath(config["workspace"]) - if not os.path.exists(workspace_path): - os.makedirs(workspace_path, exist_ok=True) - if not os.path.exists(REGRESSION_TESTS_PATH): with open(REGRESSION_TESTS_PATH, "a"): pass diff --git a/config.json b/config.json index ba2ec0b80..378e69025 100644 --- a/config.json +++ b/config.json @@ -1,6 +1,6 @@ { - "workspace": "projects/my-new-project/workspace", + "workspace": "${os.path.join(Path.home(), 'miniagi')}", "entry_path": "benchmarks.py", - "home_path": "", + "home_path": "agent/mini-agi/", "cutoff": 60 } diff --git a/pyproject.toml b/pyproject.toml index 7e95969af..e0d579cab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ markers = [ "retrieval", "regression", "basic", + "memory" ] [tool.poetry.scripts] -- cgit v1.2.3 From 9ede17891bb4a322d51ec2bf1cc9e60e93db0acd Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Fri, 7 Jul 2023 13:50:53 -0700 Subject: Add 'Debug simple typo with guidance' challenge (#65) Signed-off-by: Merwane Hamadi --- agbenchmark/agent_interface.py | 31 +++++++++-- agbenchmark/challenge.py | 62 ++++++++++++---------- agbenchmark/challenges/README.md | 3 +- agbenchmark/challenges/code/c1_test.py | 0 agbenchmark/challenges/code/code.py | 8 +++ .../challenges/code/d1/artifacts_in/__init__.py | 0 .../challenges/code/d1/artifacts_in/code.py | 13 +++++ .../challenges/code/d1/artifacts_in/test.py | 31 +++++++++++ .../challenges/code/d1/artifacts_out/__init__.py | 0 .../challenges/code/d1/artifacts_out/code.py | 12 +++++ .../challenges/code/d1/artifacts_out/test.py | 31 +++++++++++ .../d1/debug_simple_typo_with_guidance_data.json | 22 ++++++++ .../d1/debug_simple_typo_with_guidance_test.py | 31 +++++++++++ agbenchmark/challenges/define_task_types.py | 3 +- .../memory/m1/artifacts/instructions_1.txt | 2 - .../memory/m1/artifacts/instructions_2.txt | 1 - .../memory/m1/artifacts/instructions_3.txt | 1 - .../memory/m1/artifacts/instructions_4.txt | 1 - .../memory/m1/artifacts/instructions_5.txt | 1 - .../memory/m1/artifacts_in/instructions_1.txt | 2 + .../memory/m1/artifacts_in/instructions_2.txt | 1 + .../memory/m1/artifacts_in/instructions_3.txt | 1 + .../memory/m1/artifacts_in/instructions_4.txt | 1 + .../memory/m1/artifacts_in/instructions_5.txt | 1 + agbenchmark/challenges/memory/m1/m1_data.json | 3 +- agbenchmark/challenges/memory/m1/m1_test.py | 4 +- .../memory/m2/artifacts/instructions_1.txt | 1 - .../memory/m2/artifacts/instructions_2.txt | 1 - .../memory/m2/artifacts/instructions_3.txt | 1 - .../memory/m2/artifacts/instructions_4.txt | 1 - .../memory/m2/artifacts/instructions_5.txt | 1 - .../memory/m2/artifacts_in/instructions_1.txt | 1 + .../memory/m2/artifacts_in/instructions_2.txt | 1 + .../memory/m2/artifacts_in/instructions_3.txt | 1 + .../memory/m2/artifacts_in/instructions_4.txt | 1 + 
.../memory/m2/artifacts_in/instructions_5.txt | 1 + .../memory/m2/remember_multiple_ids_data.json | 3 +- .../memory/m2/remember_multiple_ids_test.py | 4 +- .../memory/m3/artifacts/instructions_1.txt | 5 -- .../memory/m3/artifacts/instructions_2.txt | 5 -- .../memory/m3/artifacts/instructions_3.txt | 5 -- .../memory/m3/artifacts/instructions_4.txt | 5 -- .../memory/m3/artifacts/instructions_5.txt | 5 -- .../memory/m3/artifacts_in/instructions_1.txt | 5 ++ .../memory/m3/artifacts_in/instructions_2.txt | 5 ++ .../memory/m3/artifacts_in/instructions_3.txt | 5 ++ .../memory/m3/artifacts_in/instructions_4.txt | 5 ++ .../memory/m3/artifacts_in/instructions_5.txt | 5 ++ .../m3/remember_multiple_ids_with_noise_data.json | 3 +- .../m3/remember_multiple_ids_with_noise_test.py | 4 +- .../memory/m4/artifacts/instructions_1.txt | 5 -- .../memory/m4/artifacts/instructions_2.txt | 5 -- .../memory/m4/artifacts/instructions_3.txt | 5 -- .../memory/m4/artifacts/instructions_4.txt | 5 -- .../memory/m4/artifacts/instructions_5.txt | 5 -- .../memory/m4/artifacts_in/instructions_1.txt | 5 ++ .../memory/m4/artifacts_in/instructions_2.txt | 5 ++ .../memory/m4/artifacts_in/instructions_3.txt | 5 ++ .../memory/m4/artifacts_in/instructions_4.txt | 5 ++ .../memory/m4/artifacts_in/instructions_5.txt | 5 ++ .../remember_multiple_phrases_with_noise_data.json | 3 +- .../remember_multiple_phrases_with_noise_test.py | 4 +- agbenchmark/challenges/retrieval/r1/r1_data.json | 3 +- agbenchmark/challenges/retrieval/r1/r1_test.py | 4 +- agbenchmark/challenges/retrieval/r2/r2_data.json | 3 +- agbenchmark/challenges/retrieval/r2/r2_test.py | 4 +- agbenchmark/challenges/retrieval/r3/r3_data.json | 3 +- agbenchmark/challenges/retrieval/r3/r3_test.py | 4 +- agbenchmark/mocks/tests/basic_mocks.py | 12 ----- .../read_file/artifacts/file_to_check.txt | 1 - .../read_file/artifacts_in/file_to_check.txt | 1 + .../read_file/artifacts_out/file_to_check.txt | 1 + .../basic_abilities/read_file/r_file_data.json | 7 +-- .../basic_abilities/read_file/read_file_test.py | 4 +- .../basic_abilities/write_file/w_file_data.json | 3 +- .../basic_abilities/write_file/write_file_test.py | 4 +- pyproject.toml | 1 + regression_tests.json | 45 +++++++++------- 78 files changed, 350 insertions(+), 147 deletions(-) delete mode 100644 agbenchmark/challenges/code/c1_test.py create mode 100644 agbenchmark/challenges/code/code.py create mode 100644 agbenchmark/challenges/code/d1/artifacts_in/__init__.py create mode 100644 agbenchmark/challenges/code/d1/artifacts_in/code.py create mode 100644 agbenchmark/challenges/code/d1/artifacts_in/test.py create mode 100644 agbenchmark/challenges/code/d1/artifacts_out/__init__.py create mode 100644 agbenchmark/challenges/code/d1/artifacts_out/code.py create mode 100644 agbenchmark/challenges/code/d1/artifacts_out/test.py create mode 100644 agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json create mode 100644 agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py delete mode 100644 agbenchmark/challenges/memory/m1/artifacts/instructions_1.txt delete mode 100644 agbenchmark/challenges/memory/m1/artifacts/instructions_2.txt delete mode 100644 agbenchmark/challenges/memory/m1/artifacts/instructions_3.txt delete mode 100644 agbenchmark/challenges/memory/m1/artifacts/instructions_4.txt delete mode 100644 agbenchmark/challenges/memory/m1/artifacts/instructions_5.txt create mode 100644 agbenchmark/challenges/memory/m1/artifacts_in/instructions_1.txt create mode 100644 
agbenchmark/challenges/memory/m1/artifacts_in/instructions_2.txt create mode 100644 agbenchmark/challenges/memory/m1/artifacts_in/instructions_3.txt create mode 100644 agbenchmark/challenges/memory/m1/artifacts_in/instructions_4.txt create mode 100644 agbenchmark/challenges/memory/m1/artifacts_in/instructions_5.txt delete mode 100644 agbenchmark/challenges/memory/m2/artifacts/instructions_1.txt delete mode 100644 agbenchmark/challenges/memory/m2/artifacts/instructions_2.txt delete mode 100644 agbenchmark/challenges/memory/m2/artifacts/instructions_3.txt delete mode 100644 agbenchmark/challenges/memory/m2/artifacts/instructions_4.txt delete mode 100644 agbenchmark/challenges/memory/m2/artifacts/instructions_5.txt create mode 100644 agbenchmark/challenges/memory/m2/artifacts_in/instructions_1.txt create mode 100644 agbenchmark/challenges/memory/m2/artifacts_in/instructions_2.txt create mode 100644 agbenchmark/challenges/memory/m2/artifacts_in/instructions_3.txt create mode 100644 agbenchmark/challenges/memory/m2/artifacts_in/instructions_4.txt create mode 100644 agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt delete mode 100644 agbenchmark/challenges/memory/m3/artifacts/instructions_1.txt delete mode 100644 agbenchmark/challenges/memory/m3/artifacts/instructions_2.txt delete mode 100644 agbenchmark/challenges/memory/m3/artifacts/instructions_3.txt delete mode 100644 agbenchmark/challenges/memory/m3/artifacts/instructions_4.txt delete mode 100644 agbenchmark/challenges/memory/m3/artifacts/instructions_5.txt create mode 100644 agbenchmark/challenges/memory/m3/artifacts_in/instructions_1.txt create mode 100644 agbenchmark/challenges/memory/m3/artifacts_in/instructions_2.txt create mode 100644 agbenchmark/challenges/memory/m3/artifacts_in/instructions_3.txt create mode 100644 agbenchmark/challenges/memory/m3/artifacts_in/instructions_4.txt create mode 100644 agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt delete mode 100644 agbenchmark/challenges/memory/m4/artifacts/instructions_1.txt delete mode 100644 agbenchmark/challenges/memory/m4/artifacts/instructions_2.txt delete mode 100644 agbenchmark/challenges/memory/m4/artifacts/instructions_3.txt delete mode 100644 agbenchmark/challenges/memory/m4/artifacts/instructions_4.txt delete mode 100644 agbenchmark/challenges/memory/m4/artifacts/instructions_5.txt create mode 100644 agbenchmark/challenges/memory/m4/artifacts_in/instructions_1.txt create mode 100644 agbenchmark/challenges/memory/m4/artifacts_in/instructions_2.txt create mode 100644 agbenchmark/challenges/memory/m4/artifacts_in/instructions_3.txt create mode 100644 agbenchmark/challenges/memory/m4/artifacts_in/instructions_4.txt create mode 100644 agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt delete mode 100644 agbenchmark/tests/basic_abilities/read_file/artifacts/file_to_check.txt create mode 100644 agbenchmark/tests/basic_abilities/read_file/artifacts_in/file_to_check.txt create mode 100644 agbenchmark/tests/basic_abilities/read_file/artifacts_out/file_to_check.txt diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index 8e9e5a14c..05540f6d3 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -1,4 +1,5 @@ import os +import shutil import subprocess import sys import time @@ -14,13 +15,20 @@ MOCK_FLAG = os.getenv("MOCK_TEST") def run_agent( - task: Optional[str], mock_func: Optional[str], config: Dict[str, Any] + task: Optional[str], + mock_func: Optional[str], + config: Dict[str, 
Any],
+    challenge_location: str,
 ) -> None:
     """Calling to get a response"""
-    if mock_func == None and MOCK_FLAG == "True":
-        print("No mock provided")
-    elif MOCK_FLAG == "True":
+    if MOCK_FLAG == "True":
+        copy_artifacts_into_workspace(
+            config["workspace"], "artifacts_out", challenge_location
+        )
+        if mock_func is None:
+            print("No mock provided")
+            return
         mock_manager = MockManager(
             task, config
         )  # workspace doesn't need to be passed in, stays the same
@@ -77,4 +85,19 @@ def run_agent(
     process.wait()
 
 
+def copy_artifacts_into_workspace(
+    workspace: str, artifact_folder_name: str, challenge_dir_path: str
+) -> None:
+    source_dir = os.path.join(challenge_dir_path, artifact_folder_name)
+
+    # Check if source_dir exists, if not then return immediately.
+    if not os.path.exists(source_dir):
+        return
+
+    for file_name in os.listdir(source_dir):
+        full_file_name = os.path.join(source_dir, file_name)
+        if os.path.isfile(full_file_name):
+            shutil.copy(full_file_name, workspace)
+
+
 ENVIRONMENT = os.getenv("ENVIRONMENT") or "production"
diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py
index dee2b435e..4c8e69848 100644
--- a/agbenchmark/challenge.py
+++ b/agbenchmark/challenge.py
@@ -1,9 +1,10 @@
 import glob
 import inspect
 import os
-import shutil
-from abc import ABC, abstractmethod
-from typing import Any, Dict, List, Optional
+import subprocess
+import types
+from abc import ABC, ABCMeta, abstractmethod
+from typing import Any, Dict, List, Optional, Tuple, Type, cast
 
 import pytest
 from dotenv import load_dotenv
@@ -16,7 +17,20 @@ mock_test_str = os.getenv("MOCK_TEST")
 MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str else False
 
 
-class Challenge(ABC):
+class ChallengeMeta(ABCMeta):
+    def __init__(self, name: str, bases: Tuple[Type, ...], dct: Dict[str, Any]) -> None:
+
+        super().__init__(name, bases, dct)
+        try:
+            frame = cast(types.FrameType, inspect.currentframe())
+            assert frame.f_back is not None
+            self.CHALLENGE_LOCATION = os.path.dirname(inspect.getfile(frame.f_back))
+        except Exception as e:
+            print(f"Unable to get the file of the calling frame due to: {str(e)}")
+            raise e
+
+
+class Challenge(ABC, metaclass=ChallengeMeta):
     """The parent class to all specific challenges classes.
Defines helper methods for running a challenge""" @@ -52,11 +66,13 @@ class Challenge(ABC): return self.data.dependencies def setup_challenge(self, config: Dict[str, Any]) -> None: - from agbenchmark.agent_interface import run_agent + from agbenchmark.agent_interface import copy_artifacts_into_workspace, run_agent - self.copy_artifacts_into_workspace(config["workspace"]) + copy_artifacts_into_workspace( + config["workspace"], "artifacts_in", self.__class__.CHALLENGE_LOCATION + ) - run_agent(self.task, self.mock, config) + run_agent(self.task, self.mock, config, self.__class__.CHALLENGE_LOCATION) @property def name(self) -> str: @@ -77,8 +93,7 @@ class Challenge(ABC): with open(workspace_dir, "r") as f: return f.read() - @staticmethod - def open_files(workspace: str, file_patterns: list) -> List[str]: + def get_artifacts_out(self, workspace: str, file_patterns: list) -> List[str]: script_dir = os.path.abspath(workspace) files_contents = [] @@ -92,8 +107,17 @@ class Challenge(ABC): matching_files = [os.path.join(script_dir, file_pattern)] for file_path in matching_files: - with open(file_path, "r") as f: - files_contents.append(f.read()) + if self.data.ground.type == "execute_python_code": + result = subprocess.run( + ["python3", file_path], + cwd=os.path.abspath(workspace), + capture_output=True, + text=True, + ) + files_contents.append(result.stdout) + else: + with open(file_path, "r") as f: + files_contents.append(f.read()) return files_contents @@ -135,19 +159,3 @@ class Challenge(ABC): ) return 1.0 - - def copy_artifacts_into_workspace(self, workspace: str) -> None: - curr_frame = inspect.currentframe() - outer_frame = inspect.getouterframes(curr_frame)[2] - caller_file_path = outer_frame.filename - caller_dir_path = os.path.dirname(os.path.abspath(caller_file_path)) - source_dir = os.path.join(caller_dir_path, "artifacts") - - # Check if source_dir exists, if not then return immediately. 
-        if not os.path.exists(source_dir):
-            return
-
-        for file_name in os.listdir(source_dir):
-            full_file_name = os.path.join(source_dir, file_name)
-            if os.path.isfile(full_file_name):
-                shutil.copy(full_file_name, workspace)
diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md
index 9e74d19ce..2d782d1fc 100644
--- a/agbenchmark/challenges/README.md
+++ b/agbenchmark/challenges/README.md
@@ -33,7 +33,8 @@ Example:
     "answer": "Washington",
     "should_contain": ["Washington"],
     "should_not_contain": ["New York", "Los Angeles", "San Francisco"],
-    "files": [".txt"]
+    "files": [".txt"],
+    "type": "file"
   },
   "mock": {
     "mock_func": "basic_write_file_mock",
diff --git a/agbenchmark/challenges/code/c1_test.py b/agbenchmark/challenges/code/c1_test.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/agbenchmark/challenges/code/code.py b/agbenchmark/challenges/code/code.py
new file mode 100644
index 000000000..508d24a90
--- /dev/null
+++ b/agbenchmark/challenges/code/code.py
@@ -0,0 +1,8 @@
+import pytest
+
+from agbenchmark.challenge import Challenge
+
+
+@pytest.mark.code
+class CodeChallenge(Challenge):
+    """Challenge for code"""
diff --git a/agbenchmark/challenges/code/d1/artifacts_in/__init__.py b/agbenchmark/challenges/code/d1/artifacts_in/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/agbenchmark/challenges/code/d1/artifacts_in/code.py b/agbenchmark/challenges/code/d1/artifacts_in/code.py
new file mode 100644
index 000000000..df8120bfa
--- /dev/null
+++ b/agbenchmark/challenges/code/d1/artifacts_in/code.py
@@ -0,0 +1,13 @@
+# mypy: ignore-errors
+from typing import List, Optional
+
+
+def two_sum(nums: List, target: int) -> Optional[List[int]]:
+    seen = {}
+    for i, num in enumerate(nums):
+        typo
+        complement = target - num
+        if complement in seen:
+            return [seen[complement], i]
+        seen[num] = i
+    return None
diff --git a/agbenchmark/challenges/code/d1/artifacts_in/test.py b/agbenchmark/challenges/code/d1/artifacts_in/test.py
new file mode 100644
index 000000000..d85d13537
--- /dev/null
+++ b/agbenchmark/challenges/code/d1/artifacts_in/test.py
@@ -0,0 +1,31 @@
+# mypy: ignore-errors
+from code import two_sum
+from typing import List
+
+
+def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
+    result = two_sum(nums, target)
+    print(result)
+    assert (
+        result == expected_result
+    ), f"AssertionError: Expected the output to be {expected_result}"
+
+
+if __name__ == "__main__":
+    # test the trivial case with the first two numbers
+    nums = [2, 7, 11, 15]
+    target = 9
+    expected_result = [0, 1]
+    test_two_sum(nums, target, expected_result)
+
+    # test for ability to use zero and the same number twice
+    nums = [2, 7, 0, 15, 12, 0]
+    target = 0
+    expected_result = [2, 5]
+    test_two_sum(nums, target, expected_result)
+
+    # test for first and last index usage and negative numbers
+    nums = [-6, 7, 11, 4]
+    target = -2
+    expected_result = [0, 3]
+    test_two_sum(nums, target, expected_result)
diff --git a/agbenchmark/challenges/code/d1/artifacts_out/__init__.py b/agbenchmark/challenges/code/d1/artifacts_out/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/agbenchmark/challenges/code/d1/artifacts_out/code.py b/agbenchmark/challenges/code/d1/artifacts_out/code.py
new file mode 100644
index 000000000..de3d8c62c
--- /dev/null
+++ b/agbenchmark/challenges/code/d1/artifacts_out/code.py
@@ -0,0 +1,12 @@
+# mypy: ignore-errors
+from typing import List, Optional
+
+
+def two_sum(nums: List, target: int) -> Optional[List[int]]:
+    seen = {}
+    for i, num in enumerate(nums):
+        complement = target - num
+        if complement in seen:
+            return [seen[complement], i]
+        seen[num] = i
+    return None
diff --git a/agbenchmark/challenges/code/d1/artifacts_out/test.py b/agbenchmark/challenges/code/d1/artifacts_out/test.py
new file mode 100644
index 000000000..d85d13537
--- /dev/null
+++ b/agbenchmark/challenges/code/d1/artifacts_out/test.py
@@ -0,0 +1,31 @@
+# mypy: ignore-errors
+from code import two_sum
+from typing import List
+
+
+def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
+    result = two_sum(nums, target)
+    print(result)
+    assert (
+        result == expected_result
+    ), f"AssertionError: Expected the output to be {expected_result}"
+
+
+if __name__ == "__main__":
+    # test the trivial case with the first two numbers
+    nums = [2, 7, 11, 15]
+    target = 9
+    expected_result = [0, 1]
+    test_two_sum(nums, target, expected_result)
+
+    # test for ability to use zero and the same number twice
+    nums = [2, 7, 0, 15, 12, 0]
+    target = 0
+    expected_result = [2, 5]
+    test_two_sum(nums, target, expected_result)
+
+    # test for first and last index usage and negative numbers
+    nums = [-6, 7, 11, 4]
+    target = -2
+    expected_result = [0, 3]
+    test_two_sum(nums, target, expected_result)
diff --git a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json
new file mode 100644
index 000000000..ce9d92987
--- /dev/null
+++ b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json
@@ -0,0 +1,22 @@
+{
+  "name": "debug_simple_typo_with_guidance",
+  "category": ["code"],
+  "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
+  "dependencies": [],
+  "ground": {
+    "answer": "2314",
+    "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
+    "should_not_contain": [],
+    "files": ["test.py"],
+    "type": "execute_python_code"
+  },
+  "mock": {
+    "mock_func": null,
+    "mock_task": null
+  },
+  "info": {
+    "difficulty": "basic",
+    "description": "Tests ability for the agent to debug python code with a simple typo in it.",
+    "side_effects": ["tests if there is in fact an LLM attached"]
+  }
+}
diff --git a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py
new file mode 100644
index 000000000..e5f50c700
--- /dev/null
+++ b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py
@@ -0,0 +1,31 @@
+import os
+from typing import Any, Dict
+
+import pytest
+
+from agbenchmark.challenges.code.code import CodeChallenge
+
+
+class TestDebugSimpleTypoWithGuidance(CodeChallenge):
+    """The first code challenge"""
+
+    def get_file_path(self) -> str:  # all tests must implement this method
+        return os.path.join(
+            os.path.dirname(__file__), "debug_simple_typo_with_guidance_data.json"
+        )
+
+    @pytest.mark.depends(name="test_debug_simple_typo_with_guidance")
+    def test_method(self, config: Dict[str, Any]) -> None:
+        self.setup_challenge(config)
+
+        files_contents = self.get_artifacts_out(
+            config["workspace"], self.data.ground.files
+        )
+
+        scores = []
+        for file_content in files_contents:
+            score = self.scoring(file_content, self.data.ground)
+            print("Your score is:", score)
+            scores.append(score)
+
+        assert 1 in scores
diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py
index 52df3017b..f84df1262
100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -5,7 +5,7 @@ from pydantic import BaseModel class Mock(BaseModel): - mock_func: str + mock_func: Optional[str] = None mock_task: Optional[str] = None @@ -20,6 +20,7 @@ class Ground(BaseModel): should_contain: Optional[List[str]] = None should_not_contain: Optional[List[str]] = None files: List[str] + type: str class ChallengeData(BaseModel): diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_1.txt b/agbenchmark/challenges/memory/m1/artifacts/instructions_1.txt deleted file mode 100644 index da3babb1f..000000000 --- a/agbenchmark/challenges/memory/m1/artifacts/instructions_1.txt +++ /dev/null @@ -1,2 +0,0 @@ -The id to remember is 2314 -Read the file instructions_2.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_2.txt b/agbenchmark/challenges/memory/m1/artifacts/instructions_2.txt deleted file mode 100644 index 87c870b1a..000000000 --- a/agbenchmark/challenges/memory/m1/artifacts/instructions_2.txt +++ /dev/null @@ -1 +0,0 @@ -Read the file instructions_3.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_3.txt b/agbenchmark/challenges/memory/m1/artifacts/instructions_3.txt deleted file mode 100644 index 987086208..000000000 --- a/agbenchmark/challenges/memory/m1/artifacts/instructions_3.txt +++ /dev/null @@ -1 +0,0 @@ -Read the file instructions_4.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_4.txt b/agbenchmark/challenges/memory/m1/artifacts/instructions_4.txt deleted file mode 100644 index 78da710b8..000000000 --- a/agbenchmark/challenges/memory/m1/artifacts/instructions_4.txt +++ /dev/null @@ -1 +0,0 @@ -Read the file instructions_5.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_5.txt b/agbenchmark/challenges/memory/m1/artifacts/instructions_5.txt deleted file mode 100644 index 8a2d35509..000000000 --- a/agbenchmark/challenges/memory/m1/artifacts/instructions_5.txt +++ /dev/null @@ -1 +0,0 @@ -Write the id previously mentioned to a .txt file. 
diff --git a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_1.txt b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_1.txt new file mode 100644 index 000000000..da3babb1f --- /dev/null +++ b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_1.txt @@ -0,0 +1,2 @@ +The id to remember is 2314 +Read the file instructions_2.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_2.txt b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_2.txt new file mode 100644 index 000000000..87c870b1a --- /dev/null +++ b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_2.txt @@ -0,0 +1 @@ +Read the file instructions_3.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_3.txt b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_3.txt new file mode 100644 index 000000000..987086208 --- /dev/null +++ b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_3.txt @@ -0,0 +1 @@ +Read the file instructions_4.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_4.txt b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_4.txt new file mode 100644 index 000000000..78da710b8 --- /dev/null +++ b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_4.txt @@ -0,0 +1 @@ +Read the file instructions_5.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_5.txt new file mode 100644 index 000000000..8a2d35509 --- /dev/null +++ b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_5.txt @@ -0,0 +1 @@ +Write the id previously mentioned to a .txt file. diff --git a/agbenchmark/challenges/memory/m1/m1_data.json b/agbenchmark/challenges/memory/m1/m1_data.json index 7023f85fc..3e410ac53 100644 --- a/agbenchmark/challenges/memory/m1/m1_data.json +++ b/agbenchmark/challenges/memory/m1/m1_data.json @@ -7,7 +7,8 @@ "answer": "2314", "should_contain": ["2314"], "should_not_contain": [], - "files": ["file_to_check.txt"] + "files": ["file_to_check.txt"], + "type": "file" }, "mock": { "mock_func": "basic_memory_mock", diff --git a/agbenchmark/challenges/memory/m1/m1_test.py b/agbenchmark/challenges/memory/m1/m1_test.py index 28e600cc3..c1f370244 100644 --- a/agbenchmark/challenges/memory/m1/m1_test.py +++ b/agbenchmark/challenges/memory/m1/m1_test.py @@ -16,7 +16,9 @@ class TestBasicMemory(MemoryChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git a/agbenchmark/challenges/memory/m2/artifacts/instructions_1.txt b/agbenchmark/challenges/memory/m2/artifacts/instructions_1.txt deleted file mode 100644 index d304ce632..000000000 --- a/agbenchmark/challenges/memory/m2/artifacts/instructions_1.txt +++ /dev/null @@ -1 +0,0 @@ -The id to remember is 3145. Read the file instructions_2.txt. diff --git a/agbenchmark/challenges/memory/m2/artifacts/instructions_2.txt b/agbenchmark/challenges/memory/m2/artifacts/instructions_2.txt deleted file mode 100644 index cfff66ba2..000000000 --- a/agbenchmark/challenges/memory/m2/artifacts/instructions_2.txt +++ /dev/null @@ -1 +0,0 @@ -The id to remember is 3791. Read the file instructions_3.txt. 
diff --git a/agbenchmark/challenges/memory/m2/artifacts/instructions_3.txt b/agbenchmark/challenges/memory/m2/artifacts/instructions_3.txt deleted file mode 100644 index ad16b6fdc..000000000 --- a/agbenchmark/challenges/memory/m2/artifacts/instructions_3.txt +++ /dev/null @@ -1 +0,0 @@ -The id to remember is 9317. Read the file instructions_4.txt. diff --git a/agbenchmark/challenges/memory/m2/artifacts/instructions_4.txt b/agbenchmark/challenges/memory/m2/artifacts/instructions_4.txt deleted file mode 100644 index 2394d64bc..000000000 --- a/agbenchmark/challenges/memory/m2/artifacts/instructions_4.txt +++ /dev/null @@ -1 +0,0 @@ -The id to remember is 9471. Read the file instructions_5.txt. diff --git a/agbenchmark/challenges/memory/m2/artifacts/instructions_5.txt b/agbenchmark/challenges/memory/m2/artifacts/instructions_5.txt deleted file mode 100644 index 92a6fba94..000000000 --- a/agbenchmark/challenges/memory/m2/artifacts/instructions_5.txt +++ /dev/null @@ -1 +0,0 @@ -Write the ids previously mentioned to a file called file_to_check.txt. diff --git a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_1.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_1.txt new file mode 100644 index 000000000..d304ce632 --- /dev/null +++ b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_1.txt @@ -0,0 +1 @@ +The id to remember is 3145. Read the file instructions_2.txt. diff --git a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_2.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_2.txt new file mode 100644 index 000000000..cfff66ba2 --- /dev/null +++ b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_2.txt @@ -0,0 +1 @@ +The id to remember is 3791. Read the file instructions_3.txt. diff --git a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_3.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_3.txt new file mode 100644 index 000000000..ad16b6fdc --- /dev/null +++ b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_3.txt @@ -0,0 +1 @@ +The id to remember is 9317. Read the file instructions_4.txt. diff --git a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_4.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_4.txt new file mode 100644 index 000000000..2394d64bc --- /dev/null +++ b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_4.txt @@ -0,0 +1 @@ +The id to remember is 9471. Read the file instructions_5.txt. diff --git a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt new file mode 100644 index 000000000..92a6fba94 --- /dev/null +++ b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt @@ -0,0 +1 @@ +Write the ids previously mentioned to a file called file_to_check.txt. 
diff --git a/agbenchmark/challenges/memory/m2/remember_multiple_ids_data.json b/agbenchmark/challenges/memory/m2/remember_multiple_ids_data.json index 374df6165..29d7339b8 100644 --- a/agbenchmark/challenges/memory/m2/remember_multiple_ids_data.json +++ b/agbenchmark/challenges/memory/m2/remember_multiple_ids_data.json @@ -7,7 +7,8 @@ "answer": "3145\n3791\n9317\n9471", "should_contain": ["3145", "3791", "9317", "9471"], "should_not_contain": [], - "files": ["file_to_check.txt"] + "files": ["file_to_check.txt"], + "type": "file" }, "mock": { "mock_func": "remember_multiple_ids_mock", diff --git a/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py b/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py index d5f0cf1a9..f0f2b3971 100644 --- a/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py +++ b/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py @@ -20,7 +20,9 @@ class TestRememberMultipleIds(MemoryChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git a/agbenchmark/challenges/memory/m3/artifacts/instructions_1.txt b/agbenchmark/challenges/memory/m3/artifacts/instructions_1.txt deleted file mode 100644 index 8deee1b09..000000000 --- a/agbenchmark/challenges/memory/m3/artifacts/instructions_1.txt +++ /dev/null @@ -1,5 +0,0 @@ -xBd9cZTfaExYalwtUVR7m0pe3Nfaf5uBS4IxGFZPZcQjDf8Tfk2vdpqWI0ESBWCdVzsdlxMmUxq43INSz1iftsv6PTOMGQ88Cojwj5mQXp8XKZ6VJC893BDLVLLW00iQy8VerWjQay9rAJz8rYVZHa6dByYNWZNOgtjC7ejnKt0VYZtUFfRBZNWG2HNX3tgX8H2h4xeu7GIfm4wETGvzlZDANeqiY2hMCXEAsliVXXjgmvVeB05tjkS5uvq5uV2DnNyStimIVVdEMFI5Ft0qM82aMvJlUtVj6TJEmE0qPTqBXeHud72iRTcBa9okCzjYiAd6oSoJ8k9o6lmFTeq323ILYCGzsICjqoysuVonfHUDh1Ll2LTo4I2AygfPqCqvgQWq9wa8YfWKBlwPPVy2lymJRTd1mS7RUaiagoNn76ApJviCYh2fWEZcxULCKAbbn0E6vz1CBADSOEIVB14ZyyRfiDcXbgYYcnOShwMsg0vYcKDKfAHk - -The id to remember is 3145. Read the file instructions_2.txt. - -OueiZyOoM5eGI5VkTt6Ue1XboZ4jztAa5IGWqSbhIhLiI4X2nOmJw6tLBd3smZNwKQpq8NHxZSk76Xd82yGI3l9KhLARXRT37MRkY7WG8YQVJEurki86cIGuXthSWjq9dDKh6ILSq4DdWPIHfDeV12psErCcgEUxWtoU6bnSMnaoYteOkKWTAkXdC1t4j5p3rXbMv1j92nLmCmoslT2A9noQIODWLdudtCecTMmrbq85RLBt5WFLGMfWVsuSrSMGo5tiN7vC1siLfhlhco0q5QaMv0px6kVg44Wceg3UXIUoMxTNoh9G8uEABJhvsF2qzxkbHuhk6VRuydIWoGgfN01upk6BDfvooyAkdcTJG5jFlHOJixTe4ramT5uP54oZ0anJTB6w7hybN3o9vb4xLbAFQxCZIXZ9HXgeBnl1L8qIvQg9VsklntCMsu2cm5CgIryRBGPqnTAbrhmAmFOkNyLSVFfYmu2wtlMov2NIkYilT4Oa1Rkt diff --git a/agbenchmark/challenges/memory/m3/artifacts/instructions_2.txt b/agbenchmark/challenges/memory/m3/artifacts/instructions_2.txt deleted file mode 100644 index c9b5fc5f9..000000000 --- a/agbenchmark/challenges/memory/m3/artifacts/instructions_2.txt +++ /dev/null @@ -1,5 +0,0 @@ -2yXfw2vPZCZq4jGOTHF4NEUYLbAUBIcmkgLxG7qXnYLNsvvZDqAvBPw4OcOfleIWvS6S5GThSPcrSfX94yB1TT6SVHGqPkulJUk4W1wfIFRIiOSps6V8ulLyrmeZsEJ6l9B9Vrm4h6SZTQVP750TUfECOH4d5j5AtupugjqThyw3t6ZFYHr2eUYRiOiTlng2uvsoZiioBQlUitrjQ4mw8FRL3VaR2aAhHGwaNV0Q7XelFU50YQgcUYqfxHxmqCLqb7dtZ7WWwxrLcqwVbj4y1YteFzPZyU4TJKopMVizgWaam8tKa1hYAQHqEaiAAHigqvYhutPHarpzc4PP2RLE4AZCxRblSY40iXpxQ9waXsrUEZ51ZRFmvm5G17wuKghMcKea2jN2MIgvSxNBy7cszFyBTe6V6u6IMk1wVWa0YulPslLc0bXUVKqZ54b61lyBAKSoFbJVRFYB5XZBL5tp2efvTsEQ3JvFmSREEOhmawIriifCApy1 - -The id to remember is 3791. Read the file instructions_3.txt. 
- -BDLfeJBcfS4iqE9sNAm4ndZT2F1fsFYdXGRpRQ6xSXl014c9H7NeMbQCtFb7kRtVvzx9AItPj1uqtjA0R35N2Pj8FgxfSPDb8dlizLA6dbKY4JfCWmibzrBYoFzoxiPX57m3n8yLKHA0aejG38aMJ6XjR75kAjBW0Cw9d3Ny0MphakfW8KDZoMO3qwsPLLASYrz42K7JjThVGZvEXczRBY8la4NJPZpj91GmfsQuJezCvcI87gjfjtRDp1GECU9SmLSWBufjQWWlc4p6z5XtPPu0vqxRjoiFDFZvafU35KkEDcWAHv3KhR0Z20JD2qIrJ4CHntwGBAk61nMBpKhNx0t3ONK5X0WD7gNCdG64obji2ifsI8ZydLkROJkAJCpe4zRd04mkydCwKGJzmCGv0lu1KRn4QobFq7mEeuzD0xvvGtyiuiVXJSVqphf5ySmfjD4EvDCMRDNZx7c4pECUnLBPDlB8JwMyugfyD5mslte9YCG9kK6n diff --git a/agbenchmark/challenges/memory/m3/artifacts/instructions_3.txt b/agbenchmark/challenges/memory/m3/artifacts/instructions_3.txt deleted file mode 100644 index a564a7c91..000000000 --- a/agbenchmark/challenges/memory/m3/artifacts/instructions_3.txt +++ /dev/null @@ -1,5 +0,0 @@ -ccgfgLxgQxZ1F3USqH9STHNVYAV3AnxU97C77kgsSkx51Czq0cLp4ZmedPY3dVNUtm2kR9HDahe7Cp6zc3zvpK9RlRKQ1rugGDkNbgc4H57MYOQpDKzbikvHWXtxFixXqNiuRJW8h8uAAKv7qsStgJ2loyM9zKVicCdHerR2Q3kT0No2DOxHUv8T1d7rN3Kf0Sh6nz2LOSqvLjtzTAxpkM7ehUzFuOoXjQnuVUHqXTzL26ASeMd74N5DgO81LofDb6mcvrmVMIY8tfDBqHV83a8zn9HySeg5ZapnAVI1KOXRXyZEgbsVe1rEy8slCXqEuYCqn8IudJBnRDNKnZRdhwQ8tCpXQ1D3quhLEJxjhpRFiLl7lEZvtkunL2UlDkErcFjfKfVJLNy2loQsB9aKbR3NXBGbQNXR9QQ2xyGJPl8Wnw03sCRjubn83LpMoC2jirhJld5VFTffPY3jw0MyX3vUF8kfHRGowsrzzcdNuESiRf3iMcmx - -The id to remember is 9317. Read the file instructions_4.txt. - -8YwXVp6zAoxCfth85iCMA2BrfDW8JjjzieFnuj7GOhW8BlbTsymfiA59qTBCuThFk0wKlT4MDskG5dU7dMHuEAKslHwcfZywvUZYFacVgSn3VfQeWBUgXE7rrrYRclqpjI3nqYuqfVzNHxn0iG13j6q5SAauFSaP2x0CWdDnEGqi9Dh7DcvjVIzlXn3GLa1BTd0qceGYCJRT7UM9YTHJYHo7Ipr2Hgreawb09Xx0KovgzBSgJYpqRNwVRFTfpgvZZIEA0QDPZE28n8KgtDPHHAIroZP6KijKCjyNLjYyM0D4AtiuLDon8bcaLRhzhC6plMvI7pkpEpsct3u9zg8C8V0pONWpWNNCompsuwl4TECiVtVygrYNcroROjNs1CGfliHCvGBbADSQORrFbhRyYEhIVNgXQsh4ilpcOYLAZoxsYlFtJw5Mf7y8ebhay1uJfNaolFWD63QID5UHBqzXZUDupjtUhD0N4XEtNeVyTqXD4S5SffdQ diff --git a/agbenchmark/challenges/memory/m3/artifacts/instructions_4.txt b/agbenchmark/challenges/memory/m3/artifacts/instructions_4.txt deleted file mode 100644 index 63fcd54fa..000000000 --- a/agbenchmark/challenges/memory/m3/artifacts/instructions_4.txt +++ /dev/null @@ -1,5 +0,0 @@ -RpsPog8gFZhpbMDHImq5OBbzE9DwrzKaqldc7UVs8FXohjoAaLb7sYW291NVDdUUk4AYwHum0VotgPPdQp3wxGza9Se6pex9G6IDd1jxPW5UTnik0Ygns0U5m5uCvuF140ozvPBj5blJ5wWIiQOXceyl7RsXIP82w1u0yTMCmAdefJISC9HAsJKg2QBbtCNoGIHEIuCZFSdskBe9XaPxPoA40AD4UXpwKIE3qEui39QyePL1czeqSGGmmRgziDiMhFiSwNarA0rwehZPW3QKpACjRu9wcVoTg7wxT4R8bziUzwVDeI5ZeqNqNrTUYiLpfUEbckcRQD3FUALmEtzIdSMFdD9z8CiSXVEAOpUbiqm9itNBnMVaZQ2eyKxet4EvZrfUbkhGntIxf8VOsgXlkzb0iXcyHu1PogCjOkSWodlPfN4mYm0P4EvUGogI3suX3tCXp5EJiZHSIoyApxme6Nd6NkdbIjcQOep42z6z23cO5uPv41cA - -The id to remember is 9471. Read the file instructions_5.txt. 
- -1d2q36DxTTRcg8ijubOcPntReHzsM94G2O0xmclXuqh8kqVaYcNdl7mmQWNSSh4OEF6BUuhVg0s58UFeJnLNOzYeC8T0fYhhrvkw8enra8JS2VeQCnTxdn4GUhJXu4V0UOhaeNnArmO03M8n6mQbV77VnH5RSVR4qVWnYmHGaswdHxYOZVnxFSyzzmhdX1dK5q9EgIwadglyBkpKu15ZaMIH2SOPjil8YjD4euRZc2EGZDcvCkJAJeLsrbTQLM82k2fKqRFyuNELCQp1DRz6gExByQ11SbHRqednp6ndj8msLC116PVMqp8pSDVZ1RrTFV9UOReeO9wnveJbLUEMh8c3wlg6yXn0XrbO7qabvdKXAjIIF8JN9HdLoFGdtDPLJbHDyJPqAYt9OIfyr8knQFLJZjKCBn8QrYy1jdFIHnENf1vZTI1bA4SUOG81xeRHGDVln9mcjdlBHdWRHX9TPbaokSjl9q7nDk6734KpPullyvm10l1M diff --git a/agbenchmark/challenges/memory/m3/artifacts/instructions_5.txt b/agbenchmark/challenges/memory/m3/artifacts/instructions_5.txt deleted file mode 100644 index 1c3f156e3..000000000 --- a/agbenchmark/challenges/memory/m3/artifacts/instructions_5.txt +++ /dev/null @@ -1,5 +0,0 @@ -sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232 - -Write the ids previously mentioned to a file called file_to_check.txt. - -gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0 diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_1.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_1.txt new file mode 100644 index 000000000..8deee1b09 --- /dev/null +++ b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_1.txt @@ -0,0 +1,5 @@ +xBd9cZTfaExYalwtUVR7m0pe3Nfaf5uBS4IxGFZPZcQjDf8Tfk2vdpqWI0ESBWCdVzsdlxMmUxq43INSz1iftsv6PTOMGQ88Cojwj5mQXp8XKZ6VJC893BDLVLLW00iQy8VerWjQay9rAJz8rYVZHa6dByYNWZNOgtjC7ejnKt0VYZtUFfRBZNWG2HNX3tgX8H2h4xeu7GIfm4wETGvzlZDANeqiY2hMCXEAsliVXXjgmvVeB05tjkS5uvq5uV2DnNyStimIVVdEMFI5Ft0qM82aMvJlUtVj6TJEmE0qPTqBXeHud72iRTcBa9okCzjYiAd6oSoJ8k9o6lmFTeq323ILYCGzsICjqoysuVonfHUDh1Ll2LTo4I2AygfPqCqvgQWq9wa8YfWKBlwPPVy2lymJRTd1mS7RUaiagoNn76ApJviCYh2fWEZcxULCKAbbn0E6vz1CBADSOEIVB14ZyyRfiDcXbgYYcnOShwMsg0vYcKDKfAHk + +The id to remember is 3145. Read the file instructions_2.txt. 
+ +OueiZyOoM5eGI5VkTt6Ue1XboZ4jztAa5IGWqSbhIhLiI4X2nOmJw6tLBd3smZNwKQpq8NHxZSk76Xd82yGI3l9KhLARXRT37MRkY7WG8YQVJEurki86cIGuXthSWjq9dDKh6ILSq4DdWPIHfDeV12psErCcgEUxWtoU6bnSMnaoYteOkKWTAkXdC1t4j5p3rXbMv1j92nLmCmoslT2A9noQIODWLdudtCecTMmrbq85RLBt5WFLGMfWVsuSrSMGo5tiN7vC1siLfhlhco0q5QaMv0px6kVg44Wceg3UXIUoMxTNoh9G8uEABJhvsF2qzxkbHuhk6VRuydIWoGgfN01upk6BDfvooyAkdcTJG5jFlHOJixTe4ramT5uP54oZ0anJTB6w7hybN3o9vb4xLbAFQxCZIXZ9HXgeBnl1L8qIvQg9VsklntCMsu2cm5CgIryRBGPqnTAbrhmAmFOkNyLSVFfYmu2wtlMov2NIkYilT4Oa1Rkt diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_2.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_2.txt new file mode 100644 index 000000000..c9b5fc5f9 --- /dev/null +++ b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_2.txt @@ -0,0 +1,5 @@ +2yXfw2vPZCZq4jGOTHF4NEUYLbAUBIcmkgLxG7qXnYLNsvvZDqAvBPw4OcOfleIWvS6S5GThSPcrSfX94yB1TT6SVHGqPkulJUk4W1wfIFRIiOSps6V8ulLyrmeZsEJ6l9B9Vrm4h6SZTQVP750TUfECOH4d5j5AtupugjqThyw3t6ZFYHr2eUYRiOiTlng2uvsoZiioBQlUitrjQ4mw8FRL3VaR2aAhHGwaNV0Q7XelFU50YQgcUYqfxHxmqCLqb7dtZ7WWwxrLcqwVbj4y1YteFzPZyU4TJKopMVizgWaam8tKa1hYAQHqEaiAAHigqvYhutPHarpzc4PP2RLE4AZCxRblSY40iXpxQ9waXsrUEZ51ZRFmvm5G17wuKghMcKea2jN2MIgvSxNBy7cszFyBTe6V6u6IMk1wVWa0YulPslLc0bXUVKqZ54b61lyBAKSoFbJVRFYB5XZBL5tp2efvTsEQ3JvFmSREEOhmawIriifCApy1 + +The id to remember is 3791. Read the file instructions_3.txt. + +BDLfeJBcfS4iqE9sNAm4ndZT2F1fsFYdXGRpRQ6xSXl014c9H7NeMbQCtFb7kRtVvzx9AItPj1uqtjA0R35N2Pj8FgxfSPDb8dlizLA6dbKY4JfCWmibzrBYoFzoxiPX57m3n8yLKHA0aejG38aMJ6XjR75kAjBW0Cw9d3Ny0MphakfW8KDZoMO3qwsPLLASYrz42K7JjThVGZvEXczRBY8la4NJPZpj91GmfsQuJezCvcI87gjfjtRDp1GECU9SmLSWBufjQWWlc4p6z5XtPPu0vqxRjoiFDFZvafU35KkEDcWAHv3KhR0Z20JD2qIrJ4CHntwGBAk61nMBpKhNx0t3ONK5X0WD7gNCdG64obji2ifsI8ZydLkROJkAJCpe4zRd04mkydCwKGJzmCGv0lu1KRn4QobFq7mEeuzD0xvvGtyiuiVXJSVqphf5ySmfjD4EvDCMRDNZx7c4pECUnLBPDlB8JwMyugfyD5mslte9YCG9kK6n diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_3.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_3.txt new file mode 100644 index 000000000..a564a7c91 --- /dev/null +++ b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_3.txt @@ -0,0 +1,5 @@ +ccgfgLxgQxZ1F3USqH9STHNVYAV3AnxU97C77kgsSkx51Czq0cLp4ZmedPY3dVNUtm2kR9HDahe7Cp6zc3zvpK9RlRKQ1rugGDkNbgc4H57MYOQpDKzbikvHWXtxFixXqNiuRJW8h8uAAKv7qsStgJ2loyM9zKVicCdHerR2Q3kT0No2DOxHUv8T1d7rN3Kf0Sh6nz2LOSqvLjtzTAxpkM7ehUzFuOoXjQnuVUHqXTzL26ASeMd74N5DgO81LofDb6mcvrmVMIY8tfDBqHV83a8zn9HySeg5ZapnAVI1KOXRXyZEgbsVe1rEy8slCXqEuYCqn8IudJBnRDNKnZRdhwQ8tCpXQ1D3quhLEJxjhpRFiLl7lEZvtkunL2UlDkErcFjfKfVJLNy2loQsB9aKbR3NXBGbQNXR9QQ2xyGJPl8Wnw03sCRjubn83LpMoC2jirhJld5VFTffPY3jw0MyX3vUF8kfHRGowsrzzcdNuESiRf3iMcmx + +The id to remember is 9317. Read the file instructions_4.txt. 
+ +8YwXVp6zAoxCfth85iCMA2BrfDW8JjjzieFnuj7GOhW8BlbTsymfiA59qTBCuThFk0wKlT4MDskG5dU7dMHuEAKslHwcfZywvUZYFacVgSn3VfQeWBUgXE7rrrYRclqpjI3nqYuqfVzNHxn0iG13j6q5SAauFSaP2x0CWdDnEGqi9Dh7DcvjVIzlXn3GLa1BTd0qceGYCJRT7UM9YTHJYHo7Ipr2Hgreawb09Xx0KovgzBSgJYpqRNwVRFTfpgvZZIEA0QDPZE28n8KgtDPHHAIroZP6KijKCjyNLjYyM0D4AtiuLDon8bcaLRhzhC6plMvI7pkpEpsct3u9zg8C8V0pONWpWNNCompsuwl4TECiVtVygrYNcroROjNs1CGfliHCvGBbADSQORrFbhRyYEhIVNgXQsh4ilpcOYLAZoxsYlFtJw5Mf7y8ebhay1uJfNaolFWD63QID5UHBqzXZUDupjtUhD0N4XEtNeVyTqXD4S5SffdQ diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_4.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_4.txt new file mode 100644 index 000000000..63fcd54fa --- /dev/null +++ b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_4.txt @@ -0,0 +1,5 @@ +RpsPog8gFZhpbMDHImq5OBbzE9DwrzKaqldc7UVs8FXohjoAaLb7sYW291NVDdUUk4AYwHum0VotgPPdQp3wxGza9Se6pex9G6IDd1jxPW5UTnik0Ygns0U5m5uCvuF140ozvPBj5blJ5wWIiQOXceyl7RsXIP82w1u0yTMCmAdefJISC9HAsJKg2QBbtCNoGIHEIuCZFSdskBe9XaPxPoA40AD4UXpwKIE3qEui39QyePL1czeqSGGmmRgziDiMhFiSwNarA0rwehZPW3QKpACjRu9wcVoTg7wxT4R8bziUzwVDeI5ZeqNqNrTUYiLpfUEbckcRQD3FUALmEtzIdSMFdD9z8CiSXVEAOpUbiqm9itNBnMVaZQ2eyKxet4EvZrfUbkhGntIxf8VOsgXlkzb0iXcyHu1PogCjOkSWodlPfN4mYm0P4EvUGogI3suX3tCXp5EJiZHSIoyApxme6Nd6NkdbIjcQOep42z6z23cO5uPv41cA + +The id to remember is 9471. Read the file instructions_5.txt. + +1d2q36DxTTRcg8ijubOcPntReHzsM94G2O0xmclXuqh8kqVaYcNdl7mmQWNSSh4OEF6BUuhVg0s58UFeJnLNOzYeC8T0fYhhrvkw8enra8JS2VeQCnTxdn4GUhJXu4V0UOhaeNnArmO03M8n6mQbV77VnH5RSVR4qVWnYmHGaswdHxYOZVnxFSyzzmhdX1dK5q9EgIwadglyBkpKu15ZaMIH2SOPjil8YjD4euRZc2EGZDcvCkJAJeLsrbTQLM82k2fKqRFyuNELCQp1DRz6gExByQ11SbHRqednp6ndj8msLC116PVMqp8pSDVZ1RrTFV9UOReeO9wnveJbLUEMh8c3wlg6yXn0XrbO7qabvdKXAjIIF8JN9HdLoFGdtDPLJbHDyJPqAYt9OIfyr8knQFLJZjKCBn8QrYy1jdFIHnENf1vZTI1bA4SUOG81xeRHGDVln9mcjdlBHdWRHX9TPbaokSjl9q7nDk6734KpPullyvm10l1M diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt new file mode 100644 index 000000000..1c3f156e3 --- /dev/null +++ b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt @@ -0,0 +1,5 @@ +sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232 + +Write the ids previously mentioned to a file called file_to_check.txt. 
+ +gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0 diff --git a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_data.json b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_data.json index 95c93ccb9..6b53c4571 100644 --- a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_data.json +++ b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_data.json @@ -7,7 +7,8 @@ "answer": "3145\n3791\n9317\n9471", "should_contain": ["3145", "3791", "9317", "9471"], "should_not_contain": [], - "files": ["file_to_check.txt"] + "files": ["file_to_check.txt"], + "type": "file" }, "mock": { "mock_func": "remember_multiple_ids_mock", diff --git a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py index 4d2d64957..493ea3574 100644 --- a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py +++ b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py @@ -21,7 +21,9 @@ class TestRememberMultipleIdsWithNoise(MemoryChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_1.txt b/agbenchmark/challenges/memory/m4/artifacts/instructions_1.txt deleted file mode 100644 index 1b1e0147c..000000000 --- a/agbenchmark/challenges/memory/m4/artifacts/instructions_1.txt +++ /dev/null @@ -1,5 +0,0 @@ -xBd9cZTfaExYalwtUVR7m0pe3Nfaf5uBS4IxGFZPZcQjDf8Tfk2vdpqWI0ESBWCdVzsdlxMmUxq43INSz1iftsv6PTOMGQ88Cojwj5mQXp8XKZ6VJC893BDLVLLW00iQy8VerWjQay9rAJz8rYVZHa6dByYNWZNOgtjC7ejnKt0VYZtUFfRBZNWG2HNX3tgX8H2h4xeu7GIfm4wETGvzlZDANeqiY2hMCXEAsliVXXjgmvVeB05tjkS5uvq5uV2DnNyStimIVVdEMFI5Ft0qM82aMvJlUtVj6TJEmE0qPTqBXeHud72iRTcBa9okCzjYiAd6oSoJ8k9o6lmFTeq323ILYCGzsICjqoysuVonfHUDh1Ll2LTo4I2AygfPqCqvgQWq9wa8YfWKBlwPPVy2lymJRTd1mS7RUaiagoNn76ApJviCYh2fWEZcxULCKAbbn0E6vz1CBADSOEIVB14ZyyRfiDcXbgYYcnOShwMsg0vYcKDKfAHk - -The phrase to remember is 'The purple elephant danced on a rainbow while eating a taco'. Read the file instructions_2.txt. 
- -OueiZyOoM5eGI5VkTt6Ue1XboZ4jztAa5IGWqSbhIhLiI4X2nOmJw6tLBd3smZNwKQpq8NHxZSk76Xd82yGI3l9KhLARXRT37MRkY7WG8YQVJEurki86cIGuXthSWjq9dDKh6ILSq4DdWPIHfDeV12psErCcgEUxWtoU6bnSMnaoYteOkKWTAkXdC1t4j5p3rXbMv1j92nLmCmoslT2A9noQIODWLdudtCecTMmrbq85RLBt5WFLGMfWVsuSrSMGo5tiN7vC1siLfhlhco0q5QaMv0px6kVg44Wceg3UXIUoMxTNoh9G8uEABJhvsF2qzxkbHuhk6VRuydIWoGgfN01upk6BDfvooyAkdcTJG5jFlHOJixTe4ramT5uP54oZ0anJTB6w7hybN3o9vb4xLbAFQxCZIXZ9HXgeBnl1L8qIvQg9VsklntCMsu2cm5CgIryRBGPqnTAbrhmAmFOkNyLSVFfYmu2wtlMov2NIkYilT4Oa1Rkt diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_2.txt b/agbenchmark/challenges/memory/m4/artifacts/instructions_2.txt deleted file mode 100644 index 92203f5c1..000000000 --- a/agbenchmark/challenges/memory/m4/artifacts/instructions_2.txt +++ /dev/null @@ -1,5 +0,0 @@ -2yXfw2vPZCZq4jGOTHF4NEUYLbAUBIcmkgLxG7qXnYLNsvvZDqAvBPw4OcOfleIWvS6S5GThSPcrSfX94yB1TT6SVHGqPkulJUk4W1wfIFRIiOSps6V8ulLyrmeZsEJ6l9B9Vrm4h6SZTQVP750TUfECOH4d5j5AtupugjqThyw3t6ZFYHr2eUYRiOiTlng2uvsoZiioBQlUitrjQ4mw8FRL3VaR2aAhHGwaNV0Q7XelFU50YQgcUYqfxHxmqCLqb7dtZ7WWwxrLcqwVbj4y1YteFzPZyU4TJKopMVizgWaam8tKa1hYAQHqEaiAAHigqvYhutPHarpzc4PP2RLE4AZCxRblSY40iXpxQ9waXsrUEZ51ZRFmvm5G17wuKghMcKea2jN2MIgvSxNBy7cszFyBTe6V6u6IMk1wVWa0YulPslLc0bXUVKqZ54b61lyBAKSoFbJVRFYB5XZBL5tp2efvTsEQ3JvFmSREEOhmawIriifCApy1 - -The phrase to remember is 'The sneaky toaster stole my socks and ran away to Hawaii'. Read the file instructions_3.txt. - -BDLfeJBcfS4iqE9sNAm4ndZT2F1fsFYdXGRpRQ6xSXl014c9H7NeMbQCtFb7kRtVvzx9AItPj1uqtjA0R35N2Pj8FgxfSPDb8dlizLA6dbKY4JfCWmibzrBYoFzoxiPX57m3n8yLKHA0aejG38aMJ6XjR75kAjBW0Cw9d3Ny0MphakfW8KDZoMO3qwsPLLASYrz42K7JjThVGZvEXczRBY8la4NJPZpj91GmfsQuJezCvcI87gjfjtRDp1GECU9SmLSWBufjQWWlc4p6z5XtPPu0vqxRjoiFDFZvafU35KkEDcWAHv3KhR0Z20JD2qIrJ4CHntwGBAk61nMBpKhNx0t3ONK5X0WD7gNCdG64obji2ifsI8ZydLkROJkAJCpe4zRd04mkydCwKGJzmCGv0lu1KRn4QobFq7mEeuzD0xvvGtyiuiVXJSVqphf5ySmfjD4EvDCMRDNZx7c4pECUnLBPDlB8JwMyugfyD5mslte9YCG9kK6n diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_3.txt b/agbenchmark/challenges/memory/m4/artifacts/instructions_3.txt deleted file mode 100644 index d7f6f08fc..000000000 --- a/agbenchmark/challenges/memory/m4/artifacts/instructions_3.txt +++ /dev/null @@ -1,5 +0,0 @@ -ccgfgLxgQxZ1F3USqH9STHNVYAV3AnxU97C77kgsSkx51Czq0cLp4ZmedPY3dVNUtm2kR9HDahe7Cp6zc3zvpK9RlRKQ1rugGDkNbgc4H57MYOQpDKzbikvHWXtxFixXqNiuRJW8h8uAAKv7qsStgJ2loyM9zKVicCdHerR2Q3kT0No2DOxHUv8T1d7rN3Kf0Sh6nz2LOSqvLjtzTAxpkM7ehUzFuOoXjQnuVUHqXTzL26ASeMd74N5DgO81LofDb6mcvrmVMIY8tfDBqHV83a8zn9HySeg5ZapnAVI1KOXRXyZEgbsVe1rEy8slCXqEuYCqn8IudJBnRDNKnZRdhwQ8tCpXQ1D3quhLEJxjhpRFiLl7lEZvtkunL2UlDkErcFjfKfVJLNy2loQsB9aKbR3NXBGbQNXR9QQ2xyGJPl8Wnw03sCRjubn83LpMoC2jirhJld5VFTffPY3jw0MyX3vUF8kfHRGowsrzzcdNuESiRf3iMcmx - -The phrase to remember is 'My pet rock sings better than Beyoncé on Tuesdays'. Read the file instructions_4.txt. 
- -8YwXVp6zAoxCfth85iCMA2BrfDW8JjjzieFnuj7GOhW8BlbTsymfiA59qTBCuThFk0wKlT4MDskG5dU7dMHuEAKslHwcfZywvUZYFacVgSn3VfQeWBUgXE7rrrYRclqpjI3nqYuqfVzNHxn0iG13j6q5SAauFSaP2x0CWdDnEGqi9Dh7DcvjVIzlXn3GLa1BTd0qceGYCJRT7UM9YTHJYHo7Ipr2Hgreawb09Xx0KovgzBSgJYpqRNwVRFTfpgvZZIEA0QDPZE28n8KgtDPHHAIroZP6KijKCjyNLjYyM0D4AtiuLDon8bcaLRhzhC6plMvI7pkpEpsct3u9zg8C8V0pONWpWNNCompsuwl4TECiVtVygrYNcroROjNs1CGfliHCvGBbADSQORrFbhRyYEhIVNgXQsh4ilpcOYLAZoxsYlFtJw5Mf7y8ebhay1uJfNaolFWD63QID5UHBqzXZUDupjtUhD0N4XEtNeVyTqXD4S5SffdQ diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_4.txt b/agbenchmark/challenges/memory/m4/artifacts/instructions_4.txt deleted file mode 100644 index 6c0a13bae..000000000 --- a/agbenchmark/challenges/memory/m4/artifacts/instructions_4.txt +++ /dev/null @@ -1,5 +0,0 @@ -RpsPog8gFZhpbMDHImq5OBbzE9DwrzKaqldc7UVs8FXohjoAaLb7sYW291NVDdUUk4AYwHum0VotgPPdQp3wxGza9Se6pex9G6IDd1jxPW5UTnik0Ygns0U5m5uCvuF140ozvPBj5blJ5wWIiQOXceyl7RsXIP82w1u0yTMCmAdefJISC9HAsJKg2QBbtCNoGIHEIuCZFSdskBe9XaPxPoA40AD4UXpwKIE3qEui39QyePL1czeqSGGmmRgziDiMhFiSwNarA0rwehZPW3QKpACjRu9wcVoTg7wxT4R8bziUzwVDeI5ZeqNqNrTUYiLpfUEbckcRQD3FUALmEtzIdSMFdD9z8CiSXVEAOpUbiqm9itNBnMVaZQ2eyKxet4EvZrfUbkhGntIxf8VOsgXlkzb0iXcyHu1PogCjOkSWodlPfN4mYm0P4EvUGogI3suX3tCXp5EJiZHSIoyApxme6Nd6NkdbIjcQOep42z6z23cO5uPv41cA - -The phrase to remember is 'The giant hamster rode a unicycle through the crowded mall'. Read the file instructions_5.txt. - -1d2q36DxTTRcg8ijubOcPntReHzsM94G2O0xmclXuqh8kqVaYcNdl7mmQWNSSh4OEF6BUuhVg0s58UFeJnLNOzYeC8T0fYhhrvkw8enra8JS2VeQCnTxdn4GUhJXu4V0UOhaeNnArmO03M8n6mQbV77VnH5RSVR4qVWnYmHGaswdHxYOZVnxFSyzzmhdX1dK5q9EgIwadglyBkpKu15ZaMIH2SOPjil8YjD4euRZc2EGZDcvCkJAJeLsrbTQLM82k2fKqRFyuNELCQp1DRz6gExByQ11SbHRqednp6ndj8msLC116PVMqp8pSDVZ1RrTFV9UOReeO9wnveJbLUEMh8c3wlg6yXn0XrbO7qabvdKXAjIIF8JN9HdLoFGdtDPLJbHDyJPqAYt9OIfyr8knQFLJZjKCBn8QrYy1jdFIHnENf1vZTI1bA4SUOG81xeRHGDVln9mcjdlBHdWRHX9TPbaokSjl9q7nDk6734KpPullyvm10l1M diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_5.txt b/agbenchmark/challenges/memory/m4/artifacts/instructions_5.txt deleted file mode 100644 index 729f0aa0d..000000000 --- a/agbenchmark/challenges/memory/m4/artifacts/instructions_5.txt +++ /dev/null @@ -1,5 +0,0 @@ -sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232 - -Write the phrases previously mentioned to a file called file_to_check.txt. 
- -gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0 diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_1.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_1.txt new file mode 100644 index 000000000..1b1e0147c --- /dev/null +++ b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_1.txt @@ -0,0 +1,5 @@ +xBd9cZTfaExYalwtUVR7m0pe3Nfaf5uBS4IxGFZPZcQjDf8Tfk2vdpqWI0ESBWCdVzsdlxMmUxq43INSz1iftsv6PTOMGQ88Cojwj5mQXp8XKZ6VJC893BDLVLLW00iQy8VerWjQay9rAJz8rYVZHa6dByYNWZNOgtjC7ejnKt0VYZtUFfRBZNWG2HNX3tgX8H2h4xeu7GIfm4wETGvzlZDANeqiY2hMCXEAsliVXXjgmvVeB05tjkS5uvq5uV2DnNyStimIVVdEMFI5Ft0qM82aMvJlUtVj6TJEmE0qPTqBXeHud72iRTcBa9okCzjYiAd6oSoJ8k9o6lmFTeq323ILYCGzsICjqoysuVonfHUDh1Ll2LTo4I2AygfPqCqvgQWq9wa8YfWKBlwPPVy2lymJRTd1mS7RUaiagoNn76ApJviCYh2fWEZcxULCKAbbn0E6vz1CBADSOEIVB14ZyyRfiDcXbgYYcnOShwMsg0vYcKDKfAHk + +The phrase to remember is 'The purple elephant danced on a rainbow while eating a taco'. Read the file instructions_2.txt. + +OueiZyOoM5eGI5VkTt6Ue1XboZ4jztAa5IGWqSbhIhLiI4X2nOmJw6tLBd3smZNwKQpq8NHxZSk76Xd82yGI3l9KhLARXRT37MRkY7WG8YQVJEurki86cIGuXthSWjq9dDKh6ILSq4DdWPIHfDeV12psErCcgEUxWtoU6bnSMnaoYteOkKWTAkXdC1t4j5p3rXbMv1j92nLmCmoslT2A9noQIODWLdudtCecTMmrbq85RLBt5WFLGMfWVsuSrSMGo5tiN7vC1siLfhlhco0q5QaMv0px6kVg44Wceg3UXIUoMxTNoh9G8uEABJhvsF2qzxkbHuhk6VRuydIWoGgfN01upk6BDfvooyAkdcTJG5jFlHOJixTe4ramT5uP54oZ0anJTB6w7hybN3o9vb4xLbAFQxCZIXZ9HXgeBnl1L8qIvQg9VsklntCMsu2cm5CgIryRBGPqnTAbrhmAmFOkNyLSVFfYmu2wtlMov2NIkYilT4Oa1Rkt diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_2.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_2.txt new file mode 100644 index 000000000..92203f5c1 --- /dev/null +++ b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_2.txt @@ -0,0 +1,5 @@ +2yXfw2vPZCZq4jGOTHF4NEUYLbAUBIcmkgLxG7qXnYLNsvvZDqAvBPw4OcOfleIWvS6S5GThSPcrSfX94yB1TT6SVHGqPkulJUk4W1wfIFRIiOSps6V8ulLyrmeZsEJ6l9B9Vrm4h6SZTQVP750TUfECOH4d5j5AtupugjqThyw3t6ZFYHr2eUYRiOiTlng2uvsoZiioBQlUitrjQ4mw8FRL3VaR2aAhHGwaNV0Q7XelFU50YQgcUYqfxHxmqCLqb7dtZ7WWwxrLcqwVbj4y1YteFzPZyU4TJKopMVizgWaam8tKa1hYAQHqEaiAAHigqvYhutPHarpzc4PP2RLE4AZCxRblSY40iXpxQ9waXsrUEZ51ZRFmvm5G17wuKghMcKea2jN2MIgvSxNBy7cszFyBTe6V6u6IMk1wVWa0YulPslLc0bXUVKqZ54b61lyBAKSoFbJVRFYB5XZBL5tp2efvTsEQ3JvFmSREEOhmawIriifCApy1 + +The phrase to remember is 'The sneaky toaster stole my socks and ran away to Hawaii'. Read the file instructions_3.txt. 
+ +BDLfeJBcfS4iqE9sNAm4ndZT2F1fsFYdXGRpRQ6xSXl014c9H7NeMbQCtFb7kRtVvzx9AItPj1uqtjA0R35N2Pj8FgxfSPDb8dlizLA6dbKY4JfCWmibzrBYoFzoxiPX57m3n8yLKHA0aejG38aMJ6XjR75kAjBW0Cw9d3Ny0MphakfW8KDZoMO3qwsPLLASYrz42K7JjThVGZvEXczRBY8la4NJPZpj91GmfsQuJezCvcI87gjfjtRDp1GECU9SmLSWBufjQWWlc4p6z5XtPPu0vqxRjoiFDFZvafU35KkEDcWAHv3KhR0Z20JD2qIrJ4CHntwGBAk61nMBpKhNx0t3ONK5X0WD7gNCdG64obji2ifsI8ZydLkROJkAJCpe4zRd04mkydCwKGJzmCGv0lu1KRn4QobFq7mEeuzD0xvvGtyiuiVXJSVqphf5ySmfjD4EvDCMRDNZx7c4pECUnLBPDlB8JwMyugfyD5mslte9YCG9kK6n diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_3.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_3.txt new file mode 100644 index 000000000..d7f6f08fc --- /dev/null +++ b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_3.txt @@ -0,0 +1,5 @@ +ccgfgLxgQxZ1F3USqH9STHNVYAV3AnxU97C77kgsSkx51Czq0cLp4ZmedPY3dVNUtm2kR9HDahe7Cp6zc3zvpK9RlRKQ1rugGDkNbgc4H57MYOQpDKzbikvHWXtxFixXqNiuRJW8h8uAAKv7qsStgJ2loyM9zKVicCdHerR2Q3kT0No2DOxHUv8T1d7rN3Kf0Sh6nz2LOSqvLjtzTAxpkM7ehUzFuOoXjQnuVUHqXTzL26ASeMd74N5DgO81LofDb6mcvrmVMIY8tfDBqHV83a8zn9HySeg5ZapnAVI1KOXRXyZEgbsVe1rEy8slCXqEuYCqn8IudJBnRDNKnZRdhwQ8tCpXQ1D3quhLEJxjhpRFiLl7lEZvtkunL2UlDkErcFjfKfVJLNy2loQsB9aKbR3NXBGbQNXR9QQ2xyGJPl8Wnw03sCRjubn83LpMoC2jirhJld5VFTffPY3jw0MyX3vUF8kfHRGowsrzzcdNuESiRf3iMcmx + +The phrase to remember is 'My pet rock sings better than Beyoncé on Tuesdays'. Read the file instructions_4.txt. + +8YwXVp6zAoxCfth85iCMA2BrfDW8JjjzieFnuj7GOhW8BlbTsymfiA59qTBCuThFk0wKlT4MDskG5dU7dMHuEAKslHwcfZywvUZYFacVgSn3VfQeWBUgXE7rrrYRclqpjI3nqYuqfVzNHxn0iG13j6q5SAauFSaP2x0CWdDnEGqi9Dh7DcvjVIzlXn3GLa1BTd0qceGYCJRT7UM9YTHJYHo7Ipr2Hgreawb09Xx0KovgzBSgJYpqRNwVRFTfpgvZZIEA0QDPZE28n8KgtDPHHAIroZP6KijKCjyNLjYyM0D4AtiuLDon8bcaLRhzhC6plMvI7pkpEpsct3u9zg8C8V0pONWpWNNCompsuwl4TECiVtVygrYNcroROjNs1CGfliHCvGBbADSQORrFbhRyYEhIVNgXQsh4ilpcOYLAZoxsYlFtJw5Mf7y8ebhay1uJfNaolFWD63QID5UHBqzXZUDupjtUhD0N4XEtNeVyTqXD4S5SffdQ diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_4.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_4.txt new file mode 100644 index 000000000..6c0a13bae --- /dev/null +++ b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_4.txt @@ -0,0 +1,5 @@ +RpsPog8gFZhpbMDHImq5OBbzE9DwrzKaqldc7UVs8FXohjoAaLb7sYW291NVDdUUk4AYwHum0VotgPPdQp3wxGza9Se6pex9G6IDd1jxPW5UTnik0Ygns0U5m5uCvuF140ozvPBj5blJ5wWIiQOXceyl7RsXIP82w1u0yTMCmAdefJISC9HAsJKg2QBbtCNoGIHEIuCZFSdskBe9XaPxPoA40AD4UXpwKIE3qEui39QyePL1czeqSGGmmRgziDiMhFiSwNarA0rwehZPW3QKpACjRu9wcVoTg7wxT4R8bziUzwVDeI5ZeqNqNrTUYiLpfUEbckcRQD3FUALmEtzIdSMFdD9z8CiSXVEAOpUbiqm9itNBnMVaZQ2eyKxet4EvZrfUbkhGntIxf8VOsgXlkzb0iXcyHu1PogCjOkSWodlPfN4mYm0P4EvUGogI3suX3tCXp5EJiZHSIoyApxme6Nd6NkdbIjcQOep42z6z23cO5uPv41cA + +The phrase to remember is 'The giant hamster rode a unicycle through the crowded mall'. Read the file instructions_5.txt. 
+ +1d2q36DxTTRcg8ijubOcPntReHzsM94G2O0xmclXuqh8kqVaYcNdl7mmQWNSSh4OEF6BUuhVg0s58UFeJnLNOzYeC8T0fYhhrvkw8enra8JS2VeQCnTxdn4GUhJXu4V0UOhaeNnArmO03M8n6mQbV77VnH5RSVR4qVWnYmHGaswdHxYOZVnxFSyzzmhdX1dK5q9EgIwadglyBkpKu15ZaMIH2SOPjil8YjD4euRZc2EGZDcvCkJAJeLsrbTQLM82k2fKqRFyuNELCQp1DRz6gExByQ11SbHRqednp6ndj8msLC116PVMqp8pSDVZ1RrTFV9UOReeO9wnveJbLUEMh8c3wlg6yXn0XrbO7qabvdKXAjIIF8JN9HdLoFGdtDPLJbHDyJPqAYt9OIfyr8knQFLJZjKCBn8QrYy1jdFIHnENf1vZTI1bA4SUOG81xeRHGDVln9mcjdlBHdWRHX9TPbaokSjl9q7nDk6734KpPullyvm10l1M diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt new file mode 100644 index 000000000..729f0aa0d --- /dev/null +++ b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt @@ -0,0 +1,5 @@ +sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232 + +Write the phrases previously mentioned to a file called file_to_check.txt. + +gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0 diff --git a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json index e1ecb16f4..316ef9476 100644 --- a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json +++ b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json @@ -12,7 +12,8 @@ "The giant hamster rode a unicycle through the crowded mall" ], "should_not_contain": [], - "files": ["file_to_check.txt"] + "files": ["file_to_check.txt"], + "type": "file" }, "mock": { "mock_func": "remember_multiple_phrases_with_noise_mock", diff --git a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py index fd33da1c0..e37e9a385 100644 --- a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py +++ b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py @@ -21,7 +21,9 @@ class TestRememberMultiplePhrasesWithNoise(MemoryChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index 44fce24ea..8fca01b78 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -7,7 +7,8 
@@ "answer": "£25.89", "should_contain": ["25.89"], "should_not_contain": [], - "files": [".txt"] + "files": [".txt"], + "type": "file" }, "mock": { "mock_func": "basic_retrieval_mock", diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index d107d9645..285b8affc 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -16,7 +16,9 @@ class TestRetrieval(RetrievalChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git a/agbenchmark/challenges/retrieval/r2/r2_data.json b/agbenchmark/challenges/retrieval/r2/r2_data.json index 925e6db83..3c388f192 100644 --- a/agbenchmark/challenges/retrieval/r2/r2_data.json +++ b/agbenchmark/challenges/retrieval/r2/r2_data.json @@ -7,7 +7,8 @@ "answer": "81,462", "should_contain": ["81,462"], "should_not_contain": [], - "files": [".txt"] + "files": [".txt"], + "type": "file" }, "mock": { "mock_func": "basic_retrieval_2_mock", diff --git a/agbenchmark/challenges/retrieval/r2/r2_test.py b/agbenchmark/challenges/retrieval/r2/r2_test.py index a60296ecd..ba727b8ed 100644 --- a/agbenchmark/challenges/retrieval/r2/r2_test.py +++ b/agbenchmark/challenges/retrieval/r2/r2_test.py @@ -16,7 +16,9 @@ class TestRetrieval2(RetrievalChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git a/agbenchmark/challenges/retrieval/r3/r3_data.json b/agbenchmark/challenges/retrieval/r3/r3_data.json index 183529c48..415456155 100644 --- a/agbenchmark/challenges/retrieval/r3/r3_data.json +++ b/agbenchmark/challenges/retrieval/r3/r3_data.json @@ -7,7 +7,8 @@ "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", "should_contain": ["15", "112", "117", "204", "413", "2,014", "3,198", "4,046", "7,000", "11,759", "21,461", "24,578", "31,536", "53,823", "81,462"], "should_not_contain": [], - "files": [".txt"] + "files": [".txt"], + "type": "file" }, "mock": { "mock_func": "basic_retrieval_3_mock", diff --git a/agbenchmark/challenges/retrieval/r3/r3_test.py b/agbenchmark/challenges/retrieval/r3/r3_test.py index bcd48d33c..b58f42672 100644 --- a/agbenchmark/challenges/retrieval/r3/r3_test.py +++ b/agbenchmark/challenges/retrieval/r3/r3_test.py @@ -16,7 +16,9 @@ class TestRetrieval3(RetrievalChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index 37ded0ae9..32149eb83 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -1,18 +1,6 @@ from agbenchmark.challenge 
import Challenge -def basic_read_file_mock(task: str, workspace: str) -> None: - """ - This mock reads a file and returns its content. - """ - - file_contents = Challenge.open_file(workspace, "file_to_check.txt") - - Challenge.write_to_file( - workspace, "file_to_check.txt", f"random string: {file_contents}" - ) - - def basic_write_file_mock(task: str, workspace: str) -> None: """ This mock writes to a file (creates one if it doesn't exist) diff --git a/agbenchmark/tests/basic_abilities/read_file/artifacts/file_to_check.txt b/agbenchmark/tests/basic_abilities/read_file/artifacts/file_to_check.txt deleted file mode 100644 index 980a0d5f1..000000000 --- a/agbenchmark/tests/basic_abilities/read_file/artifacts/file_to_check.txt +++ /dev/null @@ -1 +0,0 @@ -Hello World! diff --git a/agbenchmark/tests/basic_abilities/read_file/artifacts_in/file_to_check.txt b/agbenchmark/tests/basic_abilities/read_file/artifacts_in/file_to_check.txt new file mode 100644 index 000000000..980a0d5f1 --- /dev/null +++ b/agbenchmark/tests/basic_abilities/read_file/artifacts_in/file_to_check.txt @@ -0,0 +1 @@ +Hello World! diff --git a/agbenchmark/tests/basic_abilities/read_file/artifacts_out/file_to_check.txt b/agbenchmark/tests/basic_abilities/read_file/artifacts_out/file_to_check.txt new file mode 100644 index 000000000..c1a7879a1 --- /dev/null +++ b/agbenchmark/tests/basic_abilities/read_file/artifacts_out/file_to_check.txt @@ -0,0 +1 @@ +random string Hello World! diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index a74b875a8..7463d22fc 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -4,9 +4,10 @@ "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", "dependencies": ["basic_write_file"], "ground": { - "answer": "random string: Hello World!", - "should_contain": ["random string: Hello World!"], - "files": ["file_to_check.txt"] + "answer": "random string Hello World!", + "should_contain": ["random string", "Hello World!"], + "files": ["file_to_check.txt"], + "type": "file" }, "mock": { "mock_func": "basic_read_file_mock" diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index e7f2af9ec..7c38d2832 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -15,7 +15,9 @@ class TestReadFile(BasicChallenge): @pytest.mark.depends(on=["basic_write_file"], name="basic_read_file") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json index 358ebb538..9232a45a0 100644 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -7,7 +7,8 @@ "answer": "Washington", "should_contain": ["Washington"], "should_not_contain": ["New York", "Los Angeles", "San Francisco"], - "files": [".txt"] + "files": [".txt"], + "type": "file" }, "mock": { "mock_func": 
"basic_write_file_mock", diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 81f72cc9c..474d67127 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -16,7 +16,9 @@ class TestWriteFile(BasicChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git a/pyproject.toml b/pyproject.toml index e0d579cab..33a8671cf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ markers = [ "retrieval", "regression", "basic", + "code", "memory" ] diff --git a/regression_tests.json b/regression_tests.json index 1195efbc9..3c8988a1b 100644 --- a/regression_tests.json +++ b/regression_tests.json @@ -1,9 +1,34 @@ { + "TestDebugSimpleTypoWithGuidance": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py" + }, "TestBasicMemory": { "difficulty": "basic", "dependencies": [], "test": "agbenchmark/challenges/memory/m1/m1_test.py" }, + "TestRememberMultipleIds": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py" + }, + "TestRememberMultipleIdsWithNoise": { + "difficulty": "medium", + "dependencies": [], + "test": "agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py" + }, + "TestRememberMultiplePhrasesWithNoise": { + "difficulty": "medium", + "dependencies": [], + "test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py" + }, + "TestRetrieval": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/retrieval/r1/r1_test.py" + }, "TestWriteFile": { "difficulty": "basic", "dependencies": [], @@ -19,31 +44,11 @@ "dependencies": [], "test": "agbenchmark/challenges/retrieval/r3/r3_test.py" }, - "TestRetrieval": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/retrieval/r1/r1_test.py" - }, "TestReadFile": { "difficulty": "basic", "dependencies": [ "basic_write_file" ], "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py" - }, - "TestRememberMultipleIds": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py" - }, - "TestRememberMultipleIdsWithNoise": { - "difficulty": "medium", - "dependencies": [], - "test": "agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py" - }, - "TestRememberMultiplePhrasesWithNoise": { - "difficulty": "medium", - "dependencies": [], - "test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py" } } \ No newline at end of file -- cgit v1.2.3 From 6ef32a9b1f83ee5d628bcbcc9199374b84230a23 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Fri, 7 Jul 2023 13:55:59 -0700 Subject: Add "Debug code without guidance" challenge (#66) Signed-off-by: Merwane Hamadi --- .../d1/debug_simple_typo_with_guidance_data.json | 2 +- .../challenges/code/d2/artifacts_in/__init__.py | 0 .../challenges/code/d2/artifacts_in/code.py | 13 +++++++++ .../challenges/code/d2/artifacts_in/test.py | 31 +++++++++++++++++++++ .../challenges/code/d2/artifacts_out/__init__.py 
| 0 .../challenges/code/d2/artifacts_out/code.py | 12 ++++++++ .../challenges/code/d2/artifacts_out/test.py | 31 +++++++++++++++++++++ agbenchmark/challenges/code/d2/d2_data.json | 22 +++++++++++++++ agbenchmark/challenges/code/d2/d2_test.py | 32 ++++++++++++++++++++++ agbenchmark/mocks/mock_manager.py | 4 +-- mypy.ini | 1 + regression_tests.json | 5 ++++ 12 files changed, 150 insertions(+), 3 deletions(-) create mode 100644 agbenchmark/challenges/code/d2/artifacts_in/__init__.py create mode 100644 agbenchmark/challenges/code/d2/artifacts_in/code.py create mode 100644 agbenchmark/challenges/code/d2/artifacts_in/test.py create mode 100644 agbenchmark/challenges/code/d2/artifacts_out/__init__.py create mode 100644 agbenchmark/challenges/code/d2/artifacts_out/code.py create mode 100644 agbenchmark/challenges/code/d2/artifacts_out/test.py create mode 100644 agbenchmark/challenges/code/d2/d2_data.json create mode 100644 agbenchmark/challenges/code/d2/d2_test.py diff --git a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json index ce9d92987..c29c3d83a 100644 --- a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json +++ b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json @@ -4,7 +4,7 @@ "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", "dependencies": [], "ground": { - "answer": "2314", + "answer": "[0, 1] [2, 5] [0, 3]", "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], "should_not_contain": [], "files": ["test.py"], diff --git a/agbenchmark/challenges/code/d2/artifacts_in/__init__.py b/agbenchmark/challenges/code/d2/artifacts_in/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/challenges/code/d2/artifacts_in/code.py b/agbenchmark/challenges/code/d2/artifacts_in/code.py new file mode 100644 index 000000000..df8120bfa --- /dev/null +++ b/agbenchmark/challenges/code/d2/artifacts_in/code.py @@ -0,0 +1,13 @@ +# mypy: ignore-errors +from typing import List, Optional + + +def two_sum(nums: List, target: int) -> Optional[List[int]]: + seen = {} + for i, num in enumerate(nums): + typo + complement = target - num + if complement in seen: + return [seen[complement], i] + seen[num] = i + return None diff --git a/agbenchmark/challenges/code/d2/artifacts_in/test.py b/agbenchmark/challenges/code/d2/artifacts_in/test.py new file mode 100644 index 000000000..d85d13537 --- /dev/null +++ b/agbenchmark/challenges/code/d2/artifacts_in/test.py @@ -0,0 +1,31 @@ +# mypy: ignore-errors +from code import two_sum +from typing import List + + +def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: + result = two_sum(nums, target) + print(result) + assert ( + result == expected_result + ), f"AssertionError: Expected the output to be {expected_result}" + + +if __name__ == "__main__": + # test the trivial case with the first two numbers + nums = [2, 7, 11, 15] + target = 9 + expected_result = [0, 1] + test_two_sum(nums, target, expected_result) + + # test for ability to use zero and the same number twice + nums = [2, 7, 0, 15, 12, 0] + target = 0 + expected_result = [2, 5] + test_two_sum(nums, target, expected_result) + + # test for first and last index usage and negative numbers + nums = [-6, 7, 11, 4] + target = -2 + expected_result = [0, 3] + test_two_sum(nums, target, expected_result) diff --git 
a/agbenchmark/challenges/code/d2/artifacts_out/__init__.py b/agbenchmark/challenges/code/d2/artifacts_out/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/challenges/code/d2/artifacts_out/code.py b/agbenchmark/challenges/code/d2/artifacts_out/code.py new file mode 100644 index 000000000..de3d8c62c --- /dev/null +++ b/agbenchmark/challenges/code/d2/artifacts_out/code.py @@ -0,0 +1,12 @@ +# mypy: ignore-errors +from typing import List, Optional + + +def two_sum(nums: List, target: int) -> Optional[List[int]]: + seen = {} + for i, num in enumerate(nums): + complement = target - num + if complement in seen: + return [seen[complement], i] + seen[num] = i + return None diff --git a/agbenchmark/challenges/code/d2/artifacts_out/test.py b/agbenchmark/challenges/code/d2/artifacts_out/test.py new file mode 100644 index 000000000..d85d13537 --- /dev/null +++ b/agbenchmark/challenges/code/d2/artifacts_out/test.py @@ -0,0 +1,31 @@ +# mypy: ignore-errors +from code import two_sum +from typing import List + + +def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: + result = two_sum(nums, target) + print(result) + assert ( + result == expected_result + ), f"AssertionError: Expected the output to be {expected_result}" + + +if __name__ == "__main__": + # test the trivial case with the first two numbers + nums = [2, 7, 11, 15] + target = 9 + expected_result = [0, 1] + test_two_sum(nums, target, expected_result) + + # test for ability to use zero and the same number twice + nums = [2, 7, 0, 15, 12, 0] + target = 0 + expected_result = [2, 5] + test_two_sum(nums, target, expected_result) + + # test for first and last index usage and negative numbers + nums = [-6, 7, 11, 4] + target = -2 + expected_result = [0, 3] + test_two_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d2/d2_data.json b/agbenchmark/challenges/code/d2/d2_data.json new file mode 100644 index 000000000..6003055a8 --- /dev/null +++ b/agbenchmark/challenges/code/d2/d2_data.json @@ -0,0 +1,22 @@ +{ + "name": "debug_simple_typo_without_guidance", + "category": ["code"], + "task": "Make test.py run without errors.", + "dependencies": [], + "ground": { + "answer": "[0, 1] [2, 5] [0, 3]", + "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], + "should_not_contain": [], + "files": ["test.py"], + "type": "execute_python_code" + }, + "mock": { + "mock_func": null, + "mock_task": null + }, + "info": { + "difficulty": "basic", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/code/d2/d2_test.py b/agbenchmark/challenges/code/d2/d2_test.py new file mode 100644 index 000000000..d49f9dfe9 --- /dev/null +++ b/agbenchmark/challenges/code/d2/d2_test.py @@ -0,0 +1,32 @@ +import os +from typing import Any, Dict + +import pytest + +from agbenchmark.challenges.code.code import CodeChallenge + + +class TestDebugSimpleTypoWithoutGuidance(CodeChallenge): + """The first memory challenge""" + + def get_file_path(self) -> str: # all tests must implement this method + return os.path.join(os.path.dirname(__file__), "d2_data.json") + + @pytest.mark.depends( + name="test_debug_simple_typo_without_guidance", + depends=["test_debug_simple_typo_with_guidance"], + ) + def test_method(self, config: Dict[str, Any]) -> None: + self.setup_challenge(config) + + files_contents = self.get_artifacts_out( + 
config["workspace"], self.data.ground.files + ) + + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, self.data.ground) + print("Your score is:", score) + scores.append(score) + + assert 1 in scores diff --git a/agbenchmark/mocks/mock_manager.py b/agbenchmark/mocks/mock_manager.py index 5b84965c3..57c03405d 100644 --- a/agbenchmark/mocks/mock_manager.py +++ b/agbenchmark/mocks/mock_manager.py @@ -1,11 +1,11 @@ -from typing import Any, Dict +from typing import Any, Dict, Optional import agbenchmark.mocks.tests.basic_mocks as basic_mocks import agbenchmark.mocks.tests.retrieval_mocks as retrieval_mocks class MockManager: - def __init__(self, task: str, config: Dict[str, Any]) -> None: + def __init__(self, task: Optional[str], config: Dict[str, Any]) -> None: self.task = task self.workspace = config["workspace"] self.modules = [basic_mocks, retrieval_mocks] diff --git a/mypy.ini b/mypy.ini index ceb13fcd2..764c239f1 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,4 +1,5 @@ [mypy] +namespace_packages = True follow_imports = skip check_untyped_defs = True disallow_untyped_defs = True diff --git a/regression_tests.json b/regression_tests.json index 3c8988a1b..59a9694bf 100644 --- a/regression_tests.json +++ b/regression_tests.json @@ -50,5 +50,10 @@ "basic_write_file" ], "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py" + }, + "TestDebugSimpleTypoWithoutGuidance": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/code/d2/d2_test.py" } } \ No newline at end of file -- cgit v1.2.3 From e61523e59ed1a5582ce4a81699faef5bc36bcd16 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Fri, 7 Jul 2023 13:58:17 -0700 Subject: Get rid of get file path by using the data.json convention to store the challenge information (#67) Signed-off-by: Merwane Hamadi --- agbenchmark/README.md | 3 --- agbenchmark/challenge.py | 16 ++++--------- agbenchmark/challenges/code/d1/data.json | 22 ++++++++++++++++++ .../d1/debug_simple_typo_with_guidance_data.json | 22 ------------------ .../d1/debug_simple_typo_with_guidance_test.py | 6 ----- agbenchmark/challenges/code/d2/d2_data.json | 22 ------------------ agbenchmark/challenges/code/d2/d2_test.py | 4 ---- agbenchmark/challenges/code/d2/data.json | 22 ++++++++++++++++++ agbenchmark/challenges/memory/m1/data.json | 22 ++++++++++++++++++ agbenchmark/challenges/memory/m1/m1_data.json | 22 ------------------ agbenchmark/challenges/memory/m1/m1_test.py | 4 ---- agbenchmark/challenges/memory/m2/data.json | 22 ++++++++++++++++++ .../memory/m2/remember_multiple_ids_data.json | 22 ------------------ .../memory/m2/remember_multiple_ids_test.py | 6 ----- agbenchmark/challenges/memory/m3/data.json | 22 ++++++++++++++++++ .../m3/remember_multiple_ids_with_noise_data.json | 22 ------------------ .../m3/remember_multiple_ids_with_noise_test.py | 6 ----- agbenchmark/challenges/memory/m4/data.json | 27 ++++++++++++++++++++++ .../remember_multiple_phrases_with_noise_data.json | 27 ---------------------- .../remember_multiple_phrases_with_noise_test.py | 6 ----- agbenchmark/challenges/retrieval/r1/data.json | 22 ++++++++++++++++++ agbenchmark/challenges/retrieval/r1/r1_data.json | 22 ------------------ agbenchmark/challenges/retrieval/r1/r1_test.py | 4 ---- agbenchmark/challenges/retrieval/r2/data.json | 22 ++++++++++++++++++ agbenchmark/challenges/retrieval/r2/r2_data.json | 22 ------------------ agbenchmark/challenges/retrieval/r2/r2_test.py | 4 ---- agbenchmark/challenges/retrieval/r3/data.json | 22 
++++++++++++++++++ agbenchmark/challenges/retrieval/r3/r3_data.json | 22 ------------------ agbenchmark/challenges/retrieval/r3/r3_test.py | 4 ---- .../tests/basic_abilities/read_file/data.json | 20 ++++++++++++++++ .../basic_abilities/read_file/r_file_data.json | 20 ---------------- .../basic_abilities/read_file/read_file_test.py | 4 ---- .../tests/basic_abilities/write_file/data.json | 22 ++++++++++++++++++ .../basic_abilities/write_file/w_file_data.json | 22 ------------------ .../basic_abilities/write_file/write_file_test.py | 4 ---- 35 files changed, 249 insertions(+), 312 deletions(-) create mode 100644 agbenchmark/challenges/code/d1/data.json delete mode 100644 agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json delete mode 100644 agbenchmark/challenges/code/d2/d2_data.json create mode 100644 agbenchmark/challenges/code/d2/data.json create mode 100644 agbenchmark/challenges/memory/m1/data.json delete mode 100644 agbenchmark/challenges/memory/m1/m1_data.json create mode 100644 agbenchmark/challenges/memory/m2/data.json delete mode 100644 agbenchmark/challenges/memory/m2/remember_multiple_ids_data.json create mode 100644 agbenchmark/challenges/memory/m3/data.json delete mode 100644 agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_data.json create mode 100644 agbenchmark/challenges/memory/m4/data.json delete mode 100644 agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json create mode 100644 agbenchmark/challenges/retrieval/r1/data.json delete mode 100644 agbenchmark/challenges/retrieval/r1/r1_data.json create mode 100644 agbenchmark/challenges/retrieval/r2/data.json delete mode 100644 agbenchmark/challenges/retrieval/r2/r2_data.json create mode 100644 agbenchmark/challenges/retrieval/r3/data.json delete mode 100644 agbenchmark/challenges/retrieval/r3/r3_data.json create mode 100644 agbenchmark/tests/basic_abilities/read_file/data.json delete mode 100644 agbenchmark/tests/basic_abilities/read_file/r_file_data.json create mode 100644 agbenchmark/tests/basic_abilities/write_file/data.json delete mode 100644 agbenchmark/tests/basic_abilities/write_file/w_file_data.json diff --git a/agbenchmark/README.md b/agbenchmark/README.md index a478f83f3..01f602dc6 100644 --- a/agbenchmark/README.md +++ b/agbenchmark/README.md @@ -53,9 +53,6 @@ import os class TestWriteFile(BasicChallenge): """Testing if LLM can write to a file""" - def get_file_path(self) -> str: # all tests must implement this method - return os.path.join(os.path.dirname(__file__), "w_file_data.json") - @pytest.mark.depends(on=[], name="basic_write_file") def test_method(self, workspace): # implement scoring logic by looking at workspace diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py index 4c8e69848..29bc3ff91 100644 --- a/agbenchmark/challenge.py +++ b/agbenchmark/challenge.py @@ -3,7 +3,7 @@ import inspect import os import subprocess import types -from abc import ABC, ABCMeta, abstractmethod +from abc import ABC, ABCMeta from typing import Any, Dict, List, Optional, Tuple, Type, cast import pytest @@ -35,20 +35,12 @@ class Challenge(ABC, metaclass=ChallengeMeta): Defines helper methods for running a challenge""" _data_cache: Dict[str, ChallengeData] = {} - - @abstractmethod - def get_file_path(self) -> str: - """This should be implemented by any class which inherits from BasicChallenge""" - pass + CHALLENGE_LOCATION: str @property def data(self) -> ChallengeData: - "Check if the data is already loaded, if not load it" - file_path = ( - 
self.get_file_path() - ) # file_path serves as the key in the cache dictionary - if file_path not in Challenge._data_cache: - Challenge._data_cache[file_path] = ChallengeData.deserialize(file_path) + file_path = f"{self.CHALLENGE_LOCATION}/data.json" + Challenge._data_cache[file_path] = ChallengeData.deserialize(file_path) return Challenge._data_cache[file_path] @property diff --git a/agbenchmark/challenges/code/d1/data.json b/agbenchmark/challenges/code/d1/data.json new file mode 100644 index 000000000..c29c3d83a --- /dev/null +++ b/agbenchmark/challenges/code/d1/data.json @@ -0,0 +1,22 @@ +{ + "name": "debug_simple_typo_with_guidance", + "category": ["code"], + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "dependencies": [], + "ground": { + "answer": "[0, 1] [2, 5] [0, 3]", + "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], + "should_not_contain": [], + "files": ["test.py"], + "type": "execute_python_code" + }, + "mock": { + "mock_func": null, + "mock_task": null + }, + "info": { + "difficulty": "basic", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json deleted file mode 100644 index c29c3d83a..000000000 --- a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "name": "debug_simple_typo_with_guidance", - "category": ["code"], - "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", - "dependencies": [], - "ground": { - "answer": "[0, 1] [2, 5] [0, 3]", - "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], - "should_not_contain": [], - "files": ["test.py"], - "type": "execute_python_code" - }, - "mock": { - "mock_func": null, - "mock_task": null - }, - "info": { - "difficulty": "basic", - "description": "Tests ability for the agent to debug python code with a simple typo in it.", - "side_effects": ["tests if there is in fact an LLM attached"] - } -} diff --git a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py index e5f50c700..16a12ae41 100644 --- a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py +++ b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py @@ -1,4 +1,3 @@ -import os from typing import Any, Dict import pytest @@ -9,11 +8,6 @@ from agbenchmark.challenges.code.code import CodeChallenge class TestDebugSimpleTypoWithGuidance(CodeChallenge): """The first memory challenge""" - def get_file_path(self) -> str: # all tests must implement this method - return os.path.join( - os.path.dirname(__file__), "debug_simple_typo_with_guidance_data.json" - ) - @pytest.mark.depends(name="test_debug_simple_typo_with_guidance") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) diff --git a/agbenchmark/challenges/code/d2/d2_data.json b/agbenchmark/challenges/code/d2/d2_data.json deleted file mode 100644 index 6003055a8..000000000 --- a/agbenchmark/challenges/code/d2/d2_data.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "name": "debug_simple_typo_without_guidance", - "category": ["code"], - "task": "Make test.py run without errors.", - "dependencies": [], - 
"ground": { - "answer": "[0, 1] [2, 5] [0, 3]", - "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], - "should_not_contain": [], - "files": ["test.py"], - "type": "execute_python_code" - }, - "mock": { - "mock_func": null, - "mock_task": null - }, - "info": { - "difficulty": "basic", - "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", - "side_effects": ["tests if there is in fact an LLM attached"] - } -} diff --git a/agbenchmark/challenges/code/d2/d2_test.py b/agbenchmark/challenges/code/d2/d2_test.py index d49f9dfe9..7a5988b94 100644 --- a/agbenchmark/challenges/code/d2/d2_test.py +++ b/agbenchmark/challenges/code/d2/d2_test.py @@ -1,4 +1,3 @@ -import os from typing import Any, Dict import pytest @@ -9,9 +8,6 @@ from agbenchmark.challenges.code.code import CodeChallenge class TestDebugSimpleTypoWithoutGuidance(CodeChallenge): """The first memory challenge""" - def get_file_path(self) -> str: # all tests must implement this method - return os.path.join(os.path.dirname(__file__), "d2_data.json") - @pytest.mark.depends( name="test_debug_simple_typo_without_guidance", depends=["test_debug_simple_typo_with_guidance"], diff --git a/agbenchmark/challenges/code/d2/data.json b/agbenchmark/challenges/code/d2/data.json new file mode 100644 index 000000000..6003055a8 --- /dev/null +++ b/agbenchmark/challenges/code/d2/data.json @@ -0,0 +1,22 @@ +{ + "name": "debug_simple_typo_without_guidance", + "category": ["code"], + "task": "Make test.py run without errors.", + "dependencies": [], + "ground": { + "answer": "[0, 1] [2, 5] [0, 3]", + "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], + "should_not_contain": [], + "files": ["test.py"], + "type": "execute_python_code" + }, + "mock": { + "mock_func": null, + "mock_task": null + }, + "info": { + "difficulty": "basic", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/memory/m1/data.json b/agbenchmark/challenges/memory/m1/data.json new file mode 100644 index 000000000..3e410ac53 --- /dev/null +++ b/agbenchmark/challenges/memory/m1/data.json @@ -0,0 +1,22 @@ +{ + "name": "basic_memory", + "category": ["memory"], + "task": "Follow the instructions in the instructions_1.txt file", + "dependencies": [], + "ground": { + "answer": "2314", + "should_contain": ["2314"], + "should_not_contain": [], + "files": ["file_to_check.txt"], + "type": "file" + }, + "mock": { + "mock_func": "basic_memory_mock", + "mock_task": "Follow the instructions in the instructions_1.txt file" + }, + "info": { + "difficulty": "basic", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/memory/m1/m1_data.json b/agbenchmark/challenges/memory/m1/m1_data.json deleted file mode 100644 index 3e410ac53..000000000 --- a/agbenchmark/challenges/memory/m1/m1_data.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "name": "basic_memory", - "category": ["memory"], - "task": "Follow the instructions in the instructions_1.txt file", - "dependencies": [], - "ground": { - "answer": "2314", - "should_contain": ["2314"], - "should_not_contain": [], - "files": ["file_to_check.txt"], - "type": "file" - }, - "mock": { - "mock_func": "basic_memory_mock", - "mock_task": "Follow the instructions in the instructions_1.txt file" - }, - "info": { - "difficulty": "basic", - "description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files", - "side_effects": ["tests if there is in fact an LLM attached"] - } -} diff --git a/agbenchmark/challenges/memory/m1/m1_test.py b/agbenchmark/challenges/memory/m1/m1_test.py index c1f370244..9e5e0a775 100644 --- a/agbenchmark/challenges/memory/m1/m1_test.py +++ b/agbenchmark/challenges/memory/m1/m1_test.py @@ -1,4 +1,3 @@ -import os from typing import Any, Dict import pytest @@ -9,9 +8,6 @@ from agbenchmark.challenges.memory.memory import MemoryChallenge class TestBasicMemory(MemoryChallenge): """The first memory challenge""" - def get_file_path(self) -> str: # all tests must implement this method - return os.path.join(os.path.dirname(__file__), "m1_data.json") - @pytest.mark.depends(name="test_basic_memory") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) diff --git a/agbenchmark/challenges/memory/m2/data.json b/agbenchmark/challenges/memory/m2/data.json new file mode 100644 index 000000000..29d7339b8 --- /dev/null +++ b/agbenchmark/challenges/memory/m2/data.json @@ -0,0 +1,22 @@ +{ + "name": "remember_multiple_ids", + "category": ["memory"], + "task": "Follow the instructions in the instructions_1.txt file", + "dependencies": [], + "ground": { + "answer": "3145\n3791\n9317\n9471", + "should_contain": ["3145", "3791", "9317", "9471"], + "should_not_contain": [], + "files": ["file_to_check.txt"], + "type": "file" + }, + "mock": { + "mock_func": "remember_multiple_ids_mock", + "mock_task": "Follow the instructions in the instructions_1.txt file" + }, + "info": { + "difficulty": "basic", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/memory/m2/remember_multiple_ids_data.json b/agbenchmark/challenges/memory/m2/remember_multiple_ids_data.json deleted file mode 100644 index 29d7339b8..000000000 --- a/agbenchmark/challenges/memory/m2/remember_multiple_ids_data.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "name": "remember_multiple_ids", - "category": ["memory"], - "task": "Follow the instructions in the instructions_1.txt file", - "dependencies": [], - "ground": { - "answer": "3145\n3791\n9317\n9471", - "should_contain": ["3145", "3791", "9317", "9471"], - "should_not_contain": [], - "files": ["file_to_check.txt"], - "type": "file" - }, - "mock": { - "mock_func": "remember_multiple_ids_mock", - "mock_task": "Follow the instructions in the instructions_1.txt file" - }, - "info": { - "difficulty": "basic", - "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", - "side_effects": ["tests if there is in fact an LLM attached"] - } -} diff --git a/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py b/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py index f0f2b3971..6ba38dad3 100644 --- a/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py +++ b/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py @@ -1,4 +1,3 @@ -import os from typing import Any, Dict import pytest @@ -9,11 +8,6 @@ from agbenchmark.challenges.memory.memory import MemoryChallenge class TestRememberMultipleIds(MemoryChallenge): """The first memory challenge""" - def get_file_path(self) -> str: # all tests must implement this method - return os.path.join( - os.path.dirname(__file__), "remember_multiple_ids_data.json" - ) - @pytest.mark.depends( name="test_remember_multiple_ids", depends=["test_basic_memory"] ) diff --git a/agbenchmark/challenges/memory/m3/data.json b/agbenchmark/challenges/memory/m3/data.json new file mode 100644 index 000000000..6b53c4571 --- /dev/null +++ b/agbenchmark/challenges/memory/m3/data.json @@ -0,0 +1,22 @@ +{ + "name": "remember_multiple_ids_with_noise_mock", + "category": ["memory"], + "task": "Follow the instructions in the instructions_1.txt file", + "dependencies": [], + "ground": { + "answer": "3145\n3791\n9317\n9471", + "should_contain": ["3145", "3791", "9317", "9471"], + "should_not_contain": [], + "files": ["file_to_check.txt"], + "type": "file" + }, + "mock": { + "mock_func": "remember_multiple_ids_mock", + "mock_task": "Follow the instructions in the instructions_1.txt file" + }, + "info": { + "difficulty": "medium", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_data.json b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_data.json deleted file mode 100644 index 6b53c4571..000000000 --- a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_data.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "name": "remember_multiple_ids_with_noise_mock", - "category": ["memory"], - "task": "Follow the instructions in the instructions_1.txt file", - "dependencies": [], - "ground": { - "answer": "3145\n3791\n9317\n9471", - "should_contain": ["3145", "3791", "9317", "9471"], - "should_not_contain": [], - "files": ["file_to_check.txt"], - "type": "file" - }, - "mock": { - "mock_func": "remember_multiple_ids_mock", - "mock_task": "Follow the instructions in the instructions_1.txt file" - }, - "info": { - "difficulty": "medium", - "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", - "side_effects": ["tests if there is in fact an LLM attached"] - } -} diff --git a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py index 493ea3574..037a6929e 100644 --- a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py +++ b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py @@ -1,4 +1,3 @@ -import os from typing import Any, Dict import pytest @@ -9,11 +8,6 @@ from agbenchmark.challenges.memory.memory import MemoryChallenge class TestRememberMultipleIdsWithNoise(MemoryChallenge): """The first memory challenge""" - def get_file_path(self) -> str: # all tests must implement this method - return os.path.join( - os.path.dirname(__file__), "remember_multiple_ids_with_noise_data.json" - ) - @pytest.mark.depends( name="test_remember_multiple_ids_with_noise", depends=["test_remember_multiple_ids"], diff --git a/agbenchmark/challenges/memory/m4/data.json b/agbenchmark/challenges/memory/m4/data.json new file mode 100644 index 000000000..316ef9476 --- /dev/null +++ b/agbenchmark/challenges/memory/m4/data.json @@ -0,0 +1,27 @@ +{ + "name": "remember_multiple_phrases_with_noise_mock", + "category": ["memory"], + "task": "Follow the instructions in the instructions_1.txt file", + "dependencies": [], + "ground": { + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyoncé on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "should_contain": [ + "The purple elephant danced on a rainbow while eating a taco", + "The sneaky toaster stole my socks and ran away to Hawaii", + "My pet rock sings better than Beyoncé on Tuesdays", + "The giant hamster rode a unicycle through the crowded mall" + ], + "should_not_contain": [], + "files": ["file_to_check.txt"], + "type": "file" + }, + "mock": { + "mock_func": "remember_multiple_phrases_with_noise_mock", + "mock_task": "Follow the instructions in the instructions_1.txt file" + }, + "info": { + "difficulty": "medium", + "description": "Tests ability for the agent to remember information 
between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json deleted file mode 100644 index 316ef9476..000000000 --- a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "name": "remember_multiple_phrases_with_noise_mock", - "category": ["memory"], - "task": "Follow the instructions in the instructions_1.txt file", - "dependencies": [], - "ground": { - "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyoncé on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", - "should_contain": [ - "The purple elephant danced on a rainbow while eating a taco", - "The sneaky toaster stole my socks and ran away to Hawaii", - "My pet rock sings better than Beyoncé on Tuesdays", - "The giant hamster rode a unicycle through the crowded mall" - ], - "should_not_contain": [], - "files": ["file_to_check.txt"], - "type": "file" - }, - "mock": { - "mock_func": "remember_multiple_phrases_with_noise_mock", - "mock_task": "Follow the instructions in the instructions_1.txt file" - }, - "info": { - "difficulty": "medium", - "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", - "side_effects": ["tests if there is in fact an LLM attached"] - } -} diff --git a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py index e37e9a385..2c931af8c 100644 --- a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py +++ b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py @@ -1,4 +1,3 @@ -import os from typing import Any, Dict import pytest @@ -9,11 +8,6 @@ from agbenchmark.challenges.memory.memory import MemoryChallenge class TestRememberMultiplePhrasesWithNoise(MemoryChallenge): """The first memory challenge""" - def get_file_path(self) -> str: # all tests must implement this method - return os.path.join( - os.path.dirname(__file__), "remember_multiple_phrases_with_noise_data.json" - ) - @pytest.mark.depends( name="test_remember_multiple_phrases_with_noise", depends=["test_remember_multiple_ids_with_noise"], diff --git a/agbenchmark/challenges/retrieval/r1/data.json b/agbenchmark/challenges/retrieval/r1/data.json new file mode 100644 index 000000000..8fca01b78 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r1/data.json @@ -0,0 +1,22 @@ +{ + "name": "basic_information_retrieval", + "category": ["retrieval"], + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "dependencies": [], + "ground": { + "answer": "£25.89", + "should_contain": ["25.89"], + "should_not_contain": [], + "files": [".txt"], + "type": "file" + }, + "mock": { + "mock_func": "basic_retrieval_mock", + "mock_task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file." + }, + "info": { + "difficulty": "basic", + "description": "Tests ability to retrieve information from a website.", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json deleted file mode 100644 index 8fca01b78..000000000 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "name": "basic_information_retrieval", - "category": ["retrieval"], - "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", - "dependencies": [], - "ground": { - "answer": "£25.89", - "should_contain": ["25.89"], - "should_not_contain": [], - "files": [".txt"], - "type": "file" - }, - "mock": { - "mock_func": "basic_retrieval_mock", - "mock_task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file." 
- }, - "info": { - "difficulty": "basic", - "description": "Tests ability to retrieve information from a website.", - "side_effects": ["tests if there is in fact an LLM attached"] - } -} diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 285b8affc..68d3de4e3 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -1,4 +1,3 @@ -import os from typing import Any, Dict import pytest @@ -9,9 +8,6 @@ from agbenchmark.challenges.retrieval.retrieval import RetrievalChallenge class TestRetrieval(RetrievalChallenge): """The first information-retrieval challenge""" - def get_file_path(self) -> str: # all tests must implement this method - return os.path.join(os.path.dirname(__file__), "r1_data.json") - @pytest.mark.depends(name="test_retrieval") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) diff --git a/agbenchmark/challenges/retrieval/r2/data.json b/agbenchmark/challenges/retrieval/r2/data.json new file mode 100644 index 000000000..3c388f192 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r2/data.json @@ -0,0 +1,22 @@ +{ + "name": "basic_information_retrieval", + "category": ["retrieval"], + "task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "dependencies": [], + "ground": { + "answer": "81,462", + "should_contain": ["81,462"], + "should_not_contain": [], + "files": [".txt"], + "type": "file" + }, + "mock": { + "mock_func": "basic_retrieval_2_mock", + "mock_task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)." + }, + "info": { + "difficulty": "basic", + "description": "Tests ability to retrieve information.", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/retrieval/r2/r2_data.json b/agbenchmark/challenges/retrieval/r2/r2_data.json deleted file mode 100644 index 3c388f192..000000000 --- a/agbenchmark/challenges/retrieval/r2/r2_data.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "name": "basic_information_retrieval", - "category": ["retrieval"], - "task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", - "dependencies": [], - "ground": { - "answer": "81,462", - "should_contain": ["81,462"], - "should_not_contain": [], - "files": [".txt"], - "type": "file" - }, - "mock": { - "mock_func": "basic_retrieval_2_mock", - "mock_task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)." 
- }, - "info": { - "difficulty": "basic", - "description": "Tests ability to retrieve information.", - "side_effects": ["tests if there is in fact an LLM attached"] - } -} diff --git a/agbenchmark/challenges/retrieval/r2/r2_test.py b/agbenchmark/challenges/retrieval/r2/r2_test.py index ba727b8ed..5a1a20690 100644 --- a/agbenchmark/challenges/retrieval/r2/r2_test.py +++ b/agbenchmark/challenges/retrieval/r2/r2_test.py @@ -1,4 +1,3 @@ -import os from typing import Any, Dict import pytest @@ -9,9 +8,6 @@ from agbenchmark.challenges.retrieval.retrieval import RetrievalChallenge class TestRetrieval2(RetrievalChallenge): """The first information-retrieval challenge""" - def get_file_path(self) -> str: # all tests must implement this method - return os.path.join(os.path.dirname(__file__), "r2_data.json") - @pytest.mark.depends(on=["test_retrieval"], name="test_retrieval_2") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) diff --git a/agbenchmark/challenges/retrieval/r3/data.json b/agbenchmark/challenges/retrieval/r3/data.json new file mode 100644 index 000000000..415456155 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r3/data.json @@ -0,0 +1,22 @@ +{ + "name": "basic_information_retrieval", + "category": ["retrieval"], + "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "dependencies": [], + "ground": { + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "should_contain": ["15", "112", "117", "204", "413", "2,014", "3,198", "4,046", "7,000", "11,759", "21,461", "24,578", "31,536", "53,823", "81,462"], + "should_not_contain": [], + "files": [".txt"], + "type": "file" + }, + "mock": { + "mock_func": "basic_retrieval_3_mock", + "mock_task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)." + }, + "info": { + "difficulty": "basic", + "description": "Tests ability to retrieve information.", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/retrieval/r3/r3_data.json b/agbenchmark/challenges/retrieval/r3/r3_data.json deleted file mode 100644 index 415456155..000000000 --- a/agbenchmark/challenges/retrieval/r3/r3_data.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "name": "basic_information_retrieval", - "category": ["retrieval"], - "task": "Write tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", - "dependencies": [], - "ground": { - "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", - "should_contain": ["15", "112", "117", "204", "413", "2,014", "3,198", "4,046", "7,000", "11,759", "21,461", "24,578", "31,536", "53,823", "81,462"], - "should_not_contain": [], - "files": [".txt"], - "type": "file" - }, - "mock": { - "mock_func": "basic_retrieval_3_mock", - "mock_task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)." - }, - "info": { - "difficulty": "basic", - "description": "Tests ability to retrieve information.", - "side_effects": ["tests if there is in fact an LLM attached"] - } -} diff --git a/agbenchmark/challenges/retrieval/r3/r3_test.py b/agbenchmark/challenges/retrieval/r3/r3_test.py index b58f42672..c4b4bcf12 100644 --- a/agbenchmark/challenges/retrieval/r3/r3_test.py +++ b/agbenchmark/challenges/retrieval/r3/r3_test.py @@ -1,4 +1,3 @@ -import os from typing import Any, Dict import pytest @@ -9,9 +8,6 @@ from agbenchmark.challenges.retrieval.retrieval import RetrievalChallenge class TestRetrieval3(RetrievalChallenge): """The first information-retrieval challenge""" - def get_file_path(self) -> str: # all tests must implement this method - return os.path.join(os.path.dirname(__file__), "r3_data.json") - @pytest.mark.depends(on=["test_retrieval_2"], name="test_retrieval_3") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) diff --git a/agbenchmark/tests/basic_abilities/read_file/data.json b/agbenchmark/tests/basic_abilities/read_file/data.json new file mode 100644 index 000000000..7463d22fc --- /dev/null +++ b/agbenchmark/tests/basic_abilities/read_file/data.json @@ -0,0 +1,20 @@ +{ + "name": "basic_read_file", + "category": ["basic"], + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "dependencies": ["basic_write_file"], + "ground": { + "answer": "random string Hello World!", + "should_contain": ["random string", "Hello World!"], + "files": ["file_to_check.txt"], + "type": "file" + }, + "mock": { + "mock_func": "basic_read_file_mock" + }, + "info": { + "description": "This reads the file quickly", + "difficulty": "basic", + "side_effects": [""] + } +} diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json deleted file mode 100644 index 7463d22fc..000000000 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "name": "basic_read_file", - "category": ["basic"], - "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", - "dependencies": ["basic_write_file"], - "ground": { - "answer": "random string Hello World!", - "should_contain": ["random string", "Hello World!"], - "files": ["file_to_check.txt"], - "type": "file" - }, - "mock": { - "mock_func": "basic_read_file_mock" - }, - "info": { - "description": "This reads the file quickly", - "difficulty": "basic", - "side_effects": [""] - } -} diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py 
b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 7c38d2832..cf5dceb69 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -1,4 +1,3 @@ -import os from typing import Any, Dict import pytest @@ -9,9 +8,6 @@ from agbenchmark.tests.basic_abilities.basic_challenge import BasicChallenge class TestReadFile(BasicChallenge): """Testing if LLM can read a file""" - def get_file_path(self) -> str: # all tests must implement this method - return os.path.join(os.path.dirname(__file__), "r_file_data.json") - @pytest.mark.depends(on=["basic_write_file"], name="basic_read_file") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) diff --git a/agbenchmark/tests/basic_abilities/write_file/data.json b/agbenchmark/tests/basic_abilities/write_file/data.json new file mode 100644 index 000000000..9232a45a0 --- /dev/null +++ b/agbenchmark/tests/basic_abilities/write_file/data.json @@ -0,0 +1,22 @@ +{ + "name": "basic_write_file", + "category": ["basic"], + "task": "Print the the capital of America to a .txt file", + "dependencies": [], + "ground": { + "answer": "Washington", + "should_contain": ["Washington"], + "should_not_contain": ["New York", "Los Angeles", "San Francisco"], + "files": [".txt"], + "type": "file" + }, + "mock": { + "mock_func": "basic_write_file_mock", + "mock_task": "What is the capital of America?" + }, + "info": { + "difficulty": "basic", + "description": "Tests the writing to file", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json deleted file mode 100644 index 9232a45a0..000000000 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "name": "basic_write_file", - "category": ["basic"], - "task": "Print the the capital of America to a .txt file", - "dependencies": [], - "ground": { - "answer": "Washington", - "should_contain": ["Washington"], - "should_not_contain": ["New York", "Los Angeles", "San Francisco"], - "files": [".txt"], - "type": "file" - }, - "mock": { - "mock_func": "basic_write_file_mock", - "mock_task": "What is the capital of America?" 
- }, - "info": { - "difficulty": "basic", - "description": "Tests the writing to file", - "side_effects": ["tests if there is in fact an LLM attached"] - } -} diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 474d67127..ba0395186 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -1,4 +1,3 @@ -import os from typing import Any, Dict import pytest @@ -9,9 +8,6 @@ from agbenchmark.tests.basic_abilities.basic_challenge import BasicChallenge class TestWriteFile(BasicChallenge): """Testing if LLM can write to a file""" - def get_file_path(self) -> str: # all tests must implement this method - return os.path.join(os.path.dirname(__file__), "w_file_data.json") - @pytest.mark.depends(name="basic_write_file") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) -- cgit v1.2.3 From 4562bc6caf5008d65ccd7f0cb38df0521039cada Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Fri, 7 Jul 2023 17:54:09 -0400 Subject: Update data.json remove text --- agbenchmark/challenges/memory/m1/data.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agbenchmark/challenges/memory/m1/data.json b/agbenchmark/challenges/memory/m1/data.json index 3e410ac53..c7d441903 100644 --- a/agbenchmark/challenges/memory/m1/data.json +++ b/agbenchmark/challenges/memory/m1/data.json @@ -7,7 +7,7 @@ "answer": "2314", "should_contain": ["2314"], "should_not_contain": [], - "files": ["file_to_check.txt"], + "files": [".txt"], "type": "file" }, "mock": { -- cgit v1.2.3 From 3defe044bd55b050e5d00b3fe9a3f4ce3dca6555 Mon Sep 17 00:00:00 2001 From: Erik Peterson Date: Fri, 7 Jul 2023 15:02:08 -0700 Subject: Print out all of stdout on each process poll. (#69) --- agbenchmark/agent_interface.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index 05540f6d3..77eb110bf 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -61,8 +61,9 @@ def run_agent( while True: if process.stdout is None: continue - output = process.stdout.readline() - print(output.strip()) + + while output := process.stdout.readline(): + print(output.strip()) # Check if process has ended if process.poll() is not None: -- cgit v1.2.3 From e34c83ca1c516e84a75d81b775723cd6346776ee Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Fri, 7 Jul 2023 15:34:57 -0700 Subject: Add .txt to memory challenges (#70) --- agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt | 2 +- agbenchmark/challenges/memory/m2/data.json | 2 +- agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt | 2 +- agbenchmark/challenges/memory/m3/data.json | 2 +- agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt | 2 +- agbenchmark/challenges/memory/m4/data.json | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt index 92a6fba94..2f60afa17 100644 --- a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt +++ b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt @@ -1 +1 @@ -Write the ids previously mentioned to a file called file_to_check.txt. +Write the ids previously mentioned to a .txt file. 
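Note on the change above: replacing the literal `file_to_check.txt` in `ground.files` with the bare suffix `.txt` means the scoring step has to resolve a pattern against the workspace rather than open one fixed path. A minimal sketch of that resolution, assuming suffix patterns are globbed — the helper name `resolve_ground_files` is illustrative and not part of the repository:

    import glob
    import os
    from typing import List

    def resolve_ground_files(workspace: str, file_patterns: List[str]) -> List[str]:
        # Hypothetical helper: a leading-dot pattern such as ".txt" is treated
        # as a suffix glob ("*.txt"); anything else is taken as a literal name.
        contents: List[str] = []
        for pattern in file_patterns:
            if pattern.startswith("."):
                matches = glob.glob(os.path.join(workspace, f"*{pattern}"))
            else:
                matches = [os.path.join(workspace, pattern)]
            for path in matches:
                with open(path) as f:
                    contents.append(f.read())
        return contents

With "files": [".txt"], any text file the agent writes is collected for the `should_contain` checks, so the instructions no longer need to pin down an exact filename.
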
diff --git a/agbenchmark/challenges/memory/m2/data.json b/agbenchmark/challenges/memory/m2/data.json index 29d7339b8..6e898298a 100644 --- a/agbenchmark/challenges/memory/m2/data.json +++ b/agbenchmark/challenges/memory/m2/data.json @@ -7,7 +7,7 @@ "answer": "3145\n3791\n9317\n9471", "should_contain": ["3145", "3791", "9317", "9471"], "should_not_contain": [], - "files": ["file_to_check.txt"], + "files": [".txt"], "type": "file" }, "mock": { diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt index 1c3f156e3..db609b4b9 100644 --- a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt +++ b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt @@ -1,5 +1,5 @@ sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232 -Write the ids previously mentioned to a file called file_to_check.txt. +Write the ids previously mentioned to a to a .txt file. gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0 diff --git a/agbenchmark/challenges/memory/m3/data.json b/agbenchmark/challenges/memory/m3/data.json index 6b53c4571..2a4f06ff7 100644 --- a/agbenchmark/challenges/memory/m3/data.json +++ b/agbenchmark/challenges/memory/m3/data.json @@ -7,7 +7,7 @@ "answer": "3145\n3791\n9317\n9471", "should_contain": ["3145", "3791", "9317", "9471"], "should_not_contain": [], - "files": ["file_to_check.txt"], + "files": [".txt"], "type": "file" }, "mock": { diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt index 729f0aa0d..ee490e6c9 100644 --- a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt +++ b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt @@ -1,5 +1,5 @@ sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232 -Write the phrases previously mentioned to a file called file_to_check.txt. +Write the phrases previously mentioned to a to a .txt file. 
gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0 diff --git a/agbenchmark/challenges/memory/m4/data.json b/agbenchmark/challenges/memory/m4/data.json index 316ef9476..adfd8e33f 100644 --- a/agbenchmark/challenges/memory/m4/data.json +++ b/agbenchmark/challenges/memory/m4/data.json @@ -12,7 +12,7 @@ "The giant hamster rode a unicycle through the crowded mall" ], "should_not_contain": [], - "files": ["file_to_check.txt"], + "files": [".txt"], "type": "file" }, "mock": { -- cgit v1.2.3 From f0f7d2be900ef9349b5dcd674c1ae862649c7f0a Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Fri, 7 Jul 2023 15:38:50 -0700 Subject: Fix memory challenge 2 (#71) --- agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt | 2 +- agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt index db609b4b9..99c9efa35 100644 --- a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt +++ b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt @@ -1,5 +1,5 @@ sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232 -Write the ids previously mentioned to a to a .txt file. +Write the ids previously mentioned to a .txt file. 
gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0 diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt index ee490e6c9..b4ef60bad 100644 --- a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt +++ b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt @@ -1,5 +1,5 @@ sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232 -Write the phrases previously mentioned to a to a .txt file. +Write the phrases previously mentioned to a .txt file. gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0 -- cgit v1.2.3 From 487f99f8f2d0e1294bd261da5650dfb98d6e884c Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Fri, 7 Jul 2023 15:49:37 -0700 Subject: Use artifacts out insted of python code (#72) --- .../memory/m1/artifacts_out/random_file.txt | 1 + .../memory/m2/artifacts_out/random_file.txt | 4 ++ .../memory/m3/artifacts_out/random_file.txt | 4 ++ .../memory/m4/artifacts_out/random_file.txt | 4 ++ .../retrieval/r1/artifacts_out/random_file.txt | 1 + .../retrieval/r2/artifacts_out/random_file.txt | 1 + .../retrieval/r3/artifacts_out/random_file.txt | 15 +++++ agbenchmark/mocks/mock_manager.py | 3 +- agbenchmark/mocks/tests/basic_mocks.py | 70 +--------------------- agbenchmark/mocks/tests/retrieval_mocks.py | 5 -- .../write_file/artifacts_out/random_file.txt | 1 + regression_tests.json | 24 ++++---- 12 files changed, 46 insertions(+), 87 deletions(-) create mode 100644 agbenchmark/challenges/memory/m1/artifacts_out/random_file.txt create mode 100644 agbenchmark/challenges/memory/m2/artifacts_out/random_file.txt create mode 100644 agbenchmark/challenges/memory/m3/artifacts_out/random_file.txt create mode 100644 agbenchmark/challenges/memory/m4/artifacts_out/random_file.txt create mode 100644 agbenchmark/challenges/retrieval/r1/artifacts_out/random_file.txt create mode 100644 agbenchmark/challenges/retrieval/r2/artifacts_out/random_file.txt create mode 100644 agbenchmark/challenges/retrieval/r3/artifacts_out/random_file.txt delete mode 100644 agbenchmark/mocks/tests/retrieval_mocks.py create mode 100644 
agbenchmark/tests/basic_abilities/write_file/artifacts_out/random_file.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts_out/random_file.txt b/agbenchmark/challenges/memory/m1/artifacts_out/random_file.txt new file mode 100644 index 000000000..86be9d159 --- /dev/null +++ b/agbenchmark/challenges/memory/m1/artifacts_out/random_file.txt @@ -0,0 +1 @@ +2314 diff --git a/agbenchmark/challenges/memory/m2/artifacts_out/random_file.txt b/agbenchmark/challenges/memory/m2/artifacts_out/random_file.txt new file mode 100644 index 000000000..7d48aaf18 --- /dev/null +++ b/agbenchmark/challenges/memory/m2/artifacts_out/random_file.txt @@ -0,0 +1,4 @@ +3145 +3791 +9317 +9471 diff --git a/agbenchmark/challenges/memory/m3/artifacts_out/random_file.txt b/agbenchmark/challenges/memory/m3/artifacts_out/random_file.txt new file mode 100644 index 000000000..7d48aaf18 --- /dev/null +++ b/agbenchmark/challenges/memory/m3/artifacts_out/random_file.txt @@ -0,0 +1,4 @@ +3145 +3791 +9317 +9471 diff --git a/agbenchmark/challenges/memory/m4/artifacts_out/random_file.txt b/agbenchmark/challenges/memory/m4/artifacts_out/random_file.txt new file mode 100644 index 000000000..9b8405bf1 --- /dev/null +++ b/agbenchmark/challenges/memory/m4/artifacts_out/random_file.txt @@ -0,0 +1,4 @@ +The purple elephant danced on a rainbow while eating a taco +The sneaky toaster stole my socks and ran away to Hawaii +My pet rock sings better than Beyoncé on Tuesdays +The giant hamster rode a unicycle through the crowded mall diff --git a/agbenchmark/challenges/retrieval/r1/artifacts_out/random_file.txt b/agbenchmark/challenges/retrieval/r1/artifacts_out/random_file.txt new file mode 100644 index 000000000..f558a0f94 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r1/artifacts_out/random_file.txt @@ -0,0 +1 @@ +25.89 diff --git a/agbenchmark/challenges/retrieval/r2/artifacts_out/random_file.txt b/agbenchmark/challenges/retrieval/r2/artifacts_out/random_file.txt new file mode 100644 index 000000000..8a0eae046 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r2/artifacts_out/random_file.txt @@ -0,0 +1 @@ +81,462 Millions diff --git a/agbenchmark/challenges/retrieval/r3/artifacts_out/random_file.txt b/agbenchmark/challenges/retrieval/r3/artifacts_out/random_file.txt new file mode 100644 index 000000000..d8d5bd162 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r3/artifacts_out/random_file.txt @@ -0,0 +1,15 @@ +15 Millions +112 Millions +117 Millions +204 Millions +413 Millions +2,014 Millions +3,198 Millions +4,046 Millions +7,000 Millions +11,759 Millions +21,461 Millions +24,578 Millions +31,536 Millions +53,823 Millions +81,462 Millions diff --git a/agbenchmark/mocks/mock_manager.py b/agbenchmark/mocks/mock_manager.py index 57c03405d..3a227e49b 100644 --- a/agbenchmark/mocks/mock_manager.py +++ b/agbenchmark/mocks/mock_manager.py @@ -1,14 +1,13 @@ from typing import Any, Dict, Optional import agbenchmark.mocks.tests.basic_mocks as basic_mocks -import agbenchmark.mocks.tests.retrieval_mocks as retrieval_mocks class MockManager: def __init__(self, task: Optional[str], config: Dict[str, Any]) -> None: self.task = task self.workspace = config["workspace"] - self.modules = [basic_mocks, retrieval_mocks] + self.modules = [basic_mocks] def delegate(self, mock_function_name: Any, *args: Any, **kwargs: Any) -> None: if hasattr(self, mock_function_name): diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index 32149eb83..e4a1dedc0 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ 
b/agbenchmark/mocks/tests/basic_mocks.py @@ -1,78 +1,12 @@ from agbenchmark.challenge import Challenge -def basic_write_file_mock(task: str, workspace: str) -> None: +def example_mock(task: str, workspace: str) -> None: """ This mock writes to a file (creates one if it doesn't exist) """ Challenge.write_to_file( workspace, "file_to_check.txt", - "Washington DC is the capital of the United States of America", - ) - - -def basic_retrieval_mock(task: str, workspace: str) -> None: - """ - This mock writes to a file (creates one if it doesn't exist) - """ - Challenge.write_to_file( - workspace, - "file_to_check.txt", - "25.89", - ) - - -def basic_retrieval_2_mock(task: str, workspace: str) -> None: - """ - This mock writes to a file (creates one if it doesn't exist) - """ - Challenge.write_to_file( - workspace, - "file_to_check.txt", - "81,462", - ) - - -def basic_retrieval_3_mock(task: str, workspace: str) -> None: - """ - This mock writes to a file (creates one if it doesn't exist) - """ - Challenge.write_to_file( - workspace, - "file_to_check.txt", - "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", - ) - - -def basic_memory_mock(task: str, workspace: str) -> None: - """ - This mock writes to a file (creates one if it doesn't exist) - """ - Challenge.write_to_file( - workspace, - "file_to_check.txt", - "2314", - ) - - -def remember_multiple_ids_mock(task: str, workspace: str) -> None: - """ - This mock writes to a file (creates one if it doesn't exist) - """ - Challenge.write_to_file( - workspace, - "file_to_check.txt", - "3145\n3791\n9317\n9471", - ) - - -def remember_multiple_phrases_with_noise_mock(task: str, workspace: str) -> None: - """ - This mock writes to a file (creates one if it doesn't exist) - """ - Challenge.write_to_file( - workspace, - "file_to_check.txt", - "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyoncé on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "This is an example showing how you can use mocks but here you can use artifacts_out folder instead of a mock.", ) diff --git a/agbenchmark/mocks/tests/retrieval_mocks.py b/agbenchmark/mocks/tests/retrieval_mocks.py deleted file mode 100644 index 9a8a57db4..000000000 --- a/agbenchmark/mocks/tests/retrieval_mocks.py +++ /dev/null @@ -1,5 +0,0 @@ -# TODO: Make it so that you can specify for tests to only run if their prerequisites are met. -# Prerequisites here would be writing to a file (basic_abilities test). 
-# Should also check if prerequisites exists in regression file -def retrieval_1_mock(task: str, workspace: str) -> None: - pass diff --git a/agbenchmark/tests/basic_abilities/write_file/artifacts_out/random_file.txt b/agbenchmark/tests/basic_abilities/write_file/artifacts_out/random_file.txt new file mode 100644 index 000000000..1f275fb98 --- /dev/null +++ b/agbenchmark/tests/basic_abilities/write_file/artifacts_out/random_file.txt @@ -0,0 +1 @@ +Washington diff --git a/regression_tests.json b/regression_tests.json index 59a9694bf..9714d42a8 100644 --- a/regression_tests.json +++ b/regression_tests.json @@ -4,6 +4,11 @@ "dependencies": [], "test": "agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py" }, + "TestDebugSimpleTypoWithoutGuidance": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/code/d2/d2_test.py" + }, "TestBasicMemory": { "difficulty": "basic", "dependencies": [], @@ -19,11 +24,6 @@ "dependencies": [], "test": "agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py" }, - "TestRememberMultiplePhrasesWithNoise": { - "difficulty": "medium", - "dependencies": [], - "test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py" - }, "TestRetrieval": { "difficulty": "basic", "dependencies": [], @@ -39,11 +39,6 @@ "dependencies": [], "test": "agbenchmark/challenges/retrieval/r2/r2_test.py" }, - "TestRetrieval3": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/retrieval/r3/r3_test.py" - }, "TestReadFile": { "difficulty": "basic", "dependencies": [ @@ -51,9 +46,14 @@ ], "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py" }, - "TestDebugSimpleTypoWithoutGuidance": { + "TestRetrieval3": { "difficulty": "basic", "dependencies": [], - "test": "agbenchmark/challenges/code/d2/d2_test.py" + "test": "agbenchmark/challenges/retrieval/r3/r3_test.py" + }, + "TestRememberMultiplePhrasesWithNoise": { + "difficulty": "medium", + "dependencies": [], + "test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py" } } \ No newline at end of file -- cgit v1.2.3 From e56b112aabbd862c97db48dd5d60d09efbedd5b7 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 8 Jul 2023 03:27:31 -0400 Subject: i/o workspace, adding superagi (#60) --- .github/workflows/superagi.yml | 62 ++++++++++++++++++++++++++++++++++++++++++ .gitmodules | 4 +++ agbenchmark/agent_interface.py | 2 +- agbenchmark/challenge.py | 10 +++---- agbenchmark/conftest.py | 57 +++++++++++++++++++++++++------------- agent/SuperAGI | 1 + config.json | 6 ++-- 7 files changed, 114 insertions(+), 28 deletions(-) create mode 100644 .github/workflows/superagi.yml create mode 160000 agent/SuperAGI diff --git a/.github/workflows/superagi.yml b/.github/workflows/superagi.yml new file mode 100644 index 000000000..128c28dd7 --- /dev/null +++ b/.github/workflows/superagi.yml @@ -0,0 +1,62 @@ +name: SuperAgi Regression Test + +on: + workflow_dispatch: + branches: [master] + push: + branches: [stable, master, ci-test*] + +jobs: + regression-tests: + permissions: + pull-requests: write + contents: write + runs-on: ubuntu-latest + timeout-minutes: 30 + strategy: + matrix: + python-version: ['3.10'] + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 0 + ref: ${{ github.event.pull_request.head.ref }} + repository: ${{ github.event.pull_request.head.repo.full_name }} + submodules: true + + - name: Set up Python ${{ matrix.python-version }} + uses: 
actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - id: get_date + name: Get date + run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT + + - name: Install Poetry + run: | + curl -sSL https://install.python-poetry.org | python - + + - name: Set up Poetry cache + uses: actions/cache@v2 + with: + path: | + ~/.cache/pypoetry + .venv + key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} + + - name: Set up venv and install Python dependencies + run: | + poetry install --only main + poetry build + + - name: Run regression tests + run: | + cd agent/SuperAgi + cp config_template.yaml config.yaml + sed -i 's/OPENAI_API_KEY:.*/OPENAI_API_KEY: "'"${{ secrets.OPENAI_API_KEY }}"'"/' config.yaml + docker-compose up --build + pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl + agbenchmark start --reg diff --git a/.gitmodules b/.gitmodules index 5af445f7a..f14b5e07d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -14,3 +14,7 @@ path = agent/smol-developer url = https://github.com/merwanehamadi/developer.git branch = benchmark-integration +[submodule "agent/SuperAGI"] + path = agent/SuperAGI + url = https://github.com/SilenNaihin/SuperAGI.git + branch = benchmark-integration diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index 77eb110bf..4244fa082 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -15,7 +15,7 @@ MOCK_FLAG = os.getenv("MOCK_TEST") def run_agent( - task: Optional[str], + task: str, mock_func: Optional[str], config: Dict[str, Any], challenge_location: str, diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py index 29bc3ff91..d7e1c8965 100644 --- a/agbenchmark/challenge.py +++ b/agbenchmark/challenge.py @@ -48,8 +48,8 @@ class Challenge(ABC, metaclass=ChallengeMeta): return self.data.mock.mock_func if self.data.mock else None @property - def task(self) -> Optional[str]: - return ( + def task(self) -> str: + return str( self.data.mock.mock_task if self.data.mock and MOCK_TEST else self.data.task ) @@ -80,13 +80,13 @@ class Challenge(ABC, metaclass=ChallengeMeta): @staticmethod def open_file(workspace: str, filename: str) -> str: - script_dir = os.path.abspath(workspace) + script_dir = workspace workspace_dir = os.path.join(script_dir, filename) with open(workspace_dir, "r") as f: return f.read() def get_artifacts_out(self, workspace: str, file_patterns: list) -> List[str]: - script_dir = os.path.abspath(workspace) + script_dir = workspace files_contents = [] for file_pattern in file_patterns: @@ -115,7 +115,7 @@ class Challenge(ABC, metaclass=ChallengeMeta): @staticmethod def write_to_file(workspace: str, filename: str, content: str) -> None: - script_dir = os.path.abspath(workspace) + script_dir = workspace print("Writing file at", script_dir) workspace_dir = os.path.join(script_dir, filename) diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 7203ee6bb..40457fb67 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -10,19 +10,24 @@ from agbenchmark.start_benchmark import CONFIG_PATH, REGRESSION_TESTS_PATH from agbenchmark.tests.regression.RegressionManager import RegressionManager -def get_dynamic_workspace(config: Dict[str, Any]) -> str: - # Extract the string inside ${...} - path_expr = config["workspace"][2:-1] +def resolve_workspace(config: Dict[str, Any]) -> str: + if config.get("workspace", "").startswith("${") and config.get( + "workspace", "" + 
).endswith("}"): + # Extract the string inside ${...} + path_expr = config["workspace"][2:-1] - # Check if it starts with "os.path.join" - if path_expr.strip().startswith("os.path.join"): - # Evaluate the path string - path_value = eval(path_expr) + # Check if it starts with "os.path.join" + if path_expr.strip().startswith("os.path.join"): + # Evaluate the path string + path_value = eval(path_expr) - # Replace the original string with the evaluated result - return path_value + # Replace the original string with the evaluated result + return path_value + else: + raise ValueError("Invalid workspace path expression.") else: - raise ValueError("Invalid workspace path expression.") + return os.path.abspath(Path(os.getcwd()) / config["workspace"]) @pytest.fixture(scope="module") @@ -31,22 +36,36 @@ def config(request: Any) -> None: with open(CONFIG_PATH, "r") as f: config = json.load(f) - if config.get("workspace", "").startswith("${") and config.get( - "workspace", "" - ).endswith("}"): - path = get_dynamic_workspace(config) - config["workspace"] = path - else: - config["workspace"] = Path(os.getcwd()) / config["workspace"] + if request.config.getoption("--mock"): + config["workspace"] = "agbenchmark/mocks/workspace" + elif isinstance(config["workspace"], str): + config["workspace"] = resolve_workspace(config) + else: # it's a input output dict + config["workspace"]["input"] = resolve_workspace(config) + config["workspace"]["output"] = resolve_workspace(config) + return config @pytest.fixture(scope="module", autouse=True) def workspace(config: Dict[str, Any]) -> Generator[str, None, None]: + output_path = config["workspace"] + + # checks if its an input output paradigm + if not isinstance(config["workspace"], str): + output_path = config["workspace"]["output"] + if not os.path.exists(config["workspace"]["input"]): + os.makedirs(config["workspace"]["input"], exist_ok=True) + + # create output directory if it doesn't exist + if not os.path.exists(output_path): + os.makedirs(output_path, exist_ok=True) + yield config["workspace"] # teardown after test function completes - for filename in os.listdir(config["workspace"]): - file_path = os.path.join(config["workspace"], filename) + + for filename in os.listdir(output_path): + file_path = os.path.join(output_path, filename) try: if os.path.isfile(file_path) or os.path.islink(file_path): os.unlink(file_path) diff --git a/agent/SuperAGI b/agent/SuperAGI new file mode 160000 index 000000000..12e248e90 --- /dev/null +++ b/agent/SuperAGI @@ -0,0 +1 @@ +Subproject commit 12e248e90112e50ee011f0dcb1b3fa02030661a4 diff --git a/config.json b/config.json index 378e69025..88526a134 100644 --- a/config.json +++ b/config.json @@ -1,6 +1,6 @@ { - "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "benchmarks.py", - "home_path": "agent/mini-agi/", + "workspace": "projects/my-new-project/workspace", + "entry_path": "agent/gpt-engineer/benchmarks.py", + "home_path": "agent/gpt-engineer", "cutoff": 60 } -- cgit v1.2.3 From 082a87661224d25ed969557113e08f84febfbc12 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 8 Jul 2023 05:04:06 -0400 Subject: fixing the incorrect addition of superagi (#73) --- .github/workflows/superagi.yml | 6 +++--- agent/SuperAGI | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/superagi.yml b/.github/workflows/superagi.yml index 128c28dd7..5ab52d33f 100644 --- a/.github/workflows/superagi.yml +++ b/.github/workflows/superagi.yml @@ -54,9 +54,9 @@ jobs: - name: Run regression tests 
run: | - cd agent/SuperAgi + cd agent/SuperAGI cp config_template.yaml config.yaml sed -i 's/OPENAI_API_KEY:.*/OPENAI_API_KEY: "'"${{ secrets.OPENAI_API_KEY }}"'"/' config.yaml - docker-compose up --build + docker-compose up -d --build pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl - agbenchmark start --reg + agbenchmark start --maintain diff --git a/agent/SuperAGI b/agent/SuperAGI index 12e248e90..7ab2994d4 160000 --- a/agent/SuperAGI +++ b/agent/SuperAGI @@ -1 +1 @@ -Subproject commit 12e248e90112e50ee011f0dcb1b3fa02030661a4 +Subproject commit 7ab2994d4b44fa008f9ac27b196f134d27878916 -- cgit v1.2.3 From a35569a77b7b9f9048d340646caa1c853b39a501 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 8 Jul 2023 12:47:48 -0400 Subject: submodule integration --- .github/workflows/agentgpt.yml | 66 ++++++++++++++++++++++++++++++++++++++++++ .gitmodules | 4 +++ agent/AgentGPT | 1 + 3 files changed, 71 insertions(+) create mode 100644 .github/workflows/agentgpt.yml create mode 160000 agent/AgentGPT diff --git a/.github/workflows/agentgpt.yml b/.github/workflows/agentgpt.yml new file mode 100644 index 000000000..8c9b42203 --- /dev/null +++ b/.github/workflows/agentgpt.yml @@ -0,0 +1,66 @@ +name: AgentGPT Regression Test + +on: + workflow_dispatch: + branches: [master] + push: + branches: [stable, master, ci-test*] + +jobs: + regression-tests: + permissions: + pull-requests: write + contents: write + runs-on: ubuntu-latest + timeout-minutes: 30 + strategy: + matrix: + python-version: ['3.10'] + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 0 + ref: ${{ github.event.pull_request.head.ref }} + repository: ${{ github.event.pull_request.head.repo.full_name }} + submodules: true + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - id: get_date + name: Get date + run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT + + - name: Install Poetry + run: | + curl -sSL https://install.python-poetry.org | python - + + - name: Set up Poetry cache + uses: actions/cache@v2 + with: + path: | + ~/.cache/pypoetry + .venv + key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} + + - name: Set up venv and install Python dependencies + run: | + poetry install --only main + poetry build + + - name: Run regression tests + run: | + cd agent/AgentGPT + cd next + npm install + ../ + cp .env_example .env + docker-compose up -d --build + pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl + agbenchmark start --maintain + env: + REWORKD_PLATFORM_OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} diff --git a/.gitmodules b/.gitmodules index f14b5e07d..ee535b8b2 100644 --- a/.gitmodules +++ b/.gitmodules @@ -18,3 +18,7 @@ path = agent/SuperAGI url = https://github.com/SilenNaihin/SuperAGI.git branch = benchmark-integration +[submodule "agent/AgentGPT"] + path = agent/AgentGPT + url = https://github.com/SilenNaihin/AgentGPT.git + branch = benchmark-integration diff --git a/agent/AgentGPT b/agent/AgentGPT new file mode 160000 index 000000000..8e09b20b2 --- /dev/null +++ b/agent/AgentGPT @@ -0,0 +1 @@ +Subproject commit 8e09b20b2a38f06a38ab6afd16a00ffe2ed514c7 -- cgit v1.2.3 From 2d05c3ec5600e173d288f6714b3c3fc5e0087ae2 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 8 Jul 2023 12:50:39 -0400 Subject: reverting accidental previous changes --- .github/workflows/agentgpt.yml | 66 
------------------------------------------ .gitmodules | 4 --- agent/AgentGPT | 2 +- agent/SuperAGI | 1 - 4 files changed, 1 insertion(+), 72 deletions(-) delete mode 100644 .github/workflows/agentgpt.yml delete mode 160000 agent/SuperAGI diff --git a/.github/workflows/agentgpt.yml b/.github/workflows/agentgpt.yml deleted file mode 100644 index 8c9b42203..000000000 --- a/.github/workflows/agentgpt.yml +++ /dev/null @@ -1,66 +0,0 @@ -name: AgentGPT Regression Test - -on: - workflow_dispatch: - branches: [master] - push: - branches: [stable, master, ci-test*] - -jobs: - regression-tests: - permissions: - pull-requests: write - contents: write - runs-on: ubuntu-latest - timeout-minutes: 30 - strategy: - matrix: - python-version: ['3.10'] - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 0 - ref: ${{ github.event.pull_request.head.ref }} - repository: ${{ github.event.pull_request.head.repo.full_name }} - submodules: true - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - id: get_date - name: Get date - run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT - - - name: Install Poetry - run: | - curl -sSL https://install.python-poetry.org | python - - - - name: Set up Poetry cache - uses: actions/cache@v2 - with: - path: | - ~/.cache/pypoetry - .venv - key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} - - - name: Set up venv and install Python dependencies - run: | - poetry install --only main - poetry build - - - name: Run regression tests - run: | - cd agent/AgentGPT - cd next - npm install - ../ - cp .env_example .env - docker-compose up -d --build - pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl - agbenchmark start --maintain - env: - REWORKD_PLATFORM_OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} diff --git a/.gitmodules b/.gitmodules index ee535b8b2..f14b5e07d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -18,7 +18,3 @@ path = agent/SuperAGI url = https://github.com/SilenNaihin/SuperAGI.git branch = benchmark-integration -[submodule "agent/AgentGPT"] - path = agent/AgentGPT - url = https://github.com/SilenNaihin/AgentGPT.git - branch = benchmark-integration diff --git a/agent/AgentGPT b/agent/AgentGPT index 8e09b20b2..b92ddf858 160000 --- a/agent/AgentGPT +++ b/agent/AgentGPT @@ -1 +1 @@ -Subproject commit 8e09b20b2a38f06a38ab6afd16a00ffe2ed514c7 +Subproject commit b92ddf858529eddb6f17d85875767094f7ea2bfe diff --git a/agent/SuperAGI b/agent/SuperAGI deleted file mode 160000 index 7ab2994d4..000000000 --- a/agent/SuperAGI +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 7ab2994d4b44fa008f9ac27b196f134d27878916 -- cgit v1.2.3 From db86ccdcb4217c5c8ab909f7628a00827ab52c42 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 8 Jul 2023 13:02:47 -0400 Subject: removing agentgpt --- agent/AgentGPT | 1 - agent/SuperAGI | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 160000 agent/AgentGPT create mode 160000 agent/SuperAGI diff --git a/agent/AgentGPT b/agent/AgentGPT deleted file mode 160000 index b92ddf858..000000000 --- a/agent/AgentGPT +++ /dev/null @@ -1 +0,0 @@ -Subproject commit b92ddf858529eddb6f17d85875767094f7ea2bfe diff --git a/agent/SuperAGI b/agent/SuperAGI new file mode 160000 index 000000000..7ab2994d4 --- /dev/null +++ b/agent/SuperAGI @@ -0,0 +1 @@ +Subproject commit 7ab2994d4b44fa008f9ac27b196f134d27878916 -- cgit v1.2.3 From 
69bd41f7414c1028e61affd3a340054355d9249a Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 8 Jul 2023 21:43:38 -0400 Subject: Quality of life improvements & fixes (#75) --- .gitignore | 2 +- agbenchmark/README.md | 5 +- agbenchmark/RegressionManager.py | 29 ++++++++++ agbenchmark/agent_interface.py | 21 ++----- agbenchmark/challenge.py | 39 ++++++------- agbenchmark/challenges/README.md | 7 +-- agbenchmark/challenges/code/code.py | 8 --- agbenchmark/challenges/code/d1/data.json | 7 +-- .../d1/debug_simple_typo_with_guidance_test.py | 18 +----- agbenchmark/challenges/code/d2/d2_test.py | 20 +------ agbenchmark/challenges/code/d2/data.json | 9 +-- agbenchmark/challenges/define_task_types.py | 7 --- agbenchmark/challenges/interface/browse_test.py | 0 .../read_file/artifacts_in/file_to_check.txt | 1 + .../read_file/artifacts_out/file_to_check.txt | 1 + .../challenges/interface/read_file/data.json | 17 ++++++ .../interface/read_file/read_file_test.py | 12 ++++ .../write_file/artifacts_out/random_file.txt | 1 + .../challenges/interface/write_file/data.json | 18 ++++++ .../interface/write_file/write_file_test.py | 13 +++++ agbenchmark/challenges/memory/m1/data.json | 3 +- agbenchmark/challenges/memory/m1/m1_test.py | 18 +----- agbenchmark/challenges/memory/m2/data.json | 7 +-- .../memory/m2/remember_multiple_ids_test.py | 20 +------ agbenchmark/challenges/memory/m3/data.json | 7 +-- .../m3/remember_multiple_ids_with_noise_test.py | 21 +------ agbenchmark/challenges/memory/m4/data.json | 7 +-- .../remember_multiple_phrases_with_noise_test.py | 21 +------ agbenchmark/challenges/memory/memory.py | 8 --- agbenchmark/challenges/retrieval/r1/data.json | 7 +-- agbenchmark/challenges/retrieval/r1/r1_test.py | 18 +----- agbenchmark/challenges/retrieval/r2/data.json | 7 +-- agbenchmark/challenges/retrieval/r2/r2_test.py | 18 +----- agbenchmark/challenges/retrieval/r3/data.json | 25 +++++--- agbenchmark/challenges/retrieval/r3/r3_test.py | 17 +----- agbenchmark/challenges/retrieval/retrieval.py | 8 --- agbenchmark/conftest.py | 66 +++++++++++++++++----- agbenchmark/mocks/mock_manager.py | 28 --------- agbenchmark/mocks/tests/basic_mocks.py | 12 ---- agbenchmark/start_benchmark.py | 48 ++++++++-------- .../tests/basic_abilities/basic_challenge.py | 8 --- agbenchmark/tests/basic_abilities/browse_test.py | 0 .../read_file/artifacts_in/file_to_check.txt | 1 - .../read_file/artifacts_out/file_to_check.txt | 1 - .../tests/basic_abilities/read_file/data.json | 20 ------- .../basic_abilities/read_file/read_file_test.py | 24 -------- .../tests/basic_abilities/remember_context_test.py | 0 .../write_file/artifacts_out/random_file.txt | 1 - .../tests/basic_abilities/write_file/data.json | 22 -------- .../basic_abilities/write_file/write_file_test.py | 25 -------- agbenchmark/tests/regression/RegressionManager.py | 29 ---------- agent/Auto-GPT | 2 +- agent/gpt-engineer | 2 +- agent/smol-developer | 2 +- config.json | 6 +- pyproject.toml | 2 +- regression_tests.json | 54 +++++++++--------- 57 files changed, 279 insertions(+), 521 deletions(-) create mode 100644 agbenchmark/RegressionManager.py delete mode 100644 agbenchmark/challenges/code/code.py create mode 100644 agbenchmark/challenges/interface/browse_test.py create mode 100644 agbenchmark/challenges/interface/read_file/artifacts_in/file_to_check.txt create mode 100644 agbenchmark/challenges/interface/read_file/artifacts_out/file_to_check.txt create mode 100644 agbenchmark/challenges/interface/read_file/data.json create mode 100644 
agbenchmark/challenges/interface/read_file/read_file_test.py create mode 100644 agbenchmark/challenges/interface/write_file/artifacts_out/random_file.txt create mode 100644 agbenchmark/challenges/interface/write_file/data.json create mode 100644 agbenchmark/challenges/interface/write_file/write_file_test.py delete mode 100644 agbenchmark/challenges/memory/memory.py delete mode 100644 agbenchmark/challenges/retrieval/retrieval.py delete mode 100644 agbenchmark/mocks/mock_manager.py delete mode 100644 agbenchmark/mocks/tests/basic_mocks.py delete mode 100644 agbenchmark/tests/basic_abilities/basic_challenge.py delete mode 100644 agbenchmark/tests/basic_abilities/browse_test.py delete mode 100644 agbenchmark/tests/basic_abilities/read_file/artifacts_in/file_to_check.txt delete mode 100644 agbenchmark/tests/basic_abilities/read_file/artifacts_out/file_to_check.txt delete mode 100644 agbenchmark/tests/basic_abilities/read_file/data.json delete mode 100644 agbenchmark/tests/basic_abilities/read_file/read_file_test.py delete mode 100644 agbenchmark/tests/basic_abilities/remember_context_test.py delete mode 100644 agbenchmark/tests/basic_abilities/write_file/artifacts_out/random_file.txt delete mode 100644 agbenchmark/tests/basic_abilities/write_file/data.json delete mode 100644 agbenchmark/tests/basic_abilities/write_file/write_file_test.py delete mode 100644 agbenchmark/tests/regression/RegressionManager.py diff --git a/.gitignore b/.gitignore index 3581dc933..7d0419ca4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -agbenchmark/mocks/workspace/ +agbenchmark/workspace/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/agbenchmark/README.md b/agbenchmark/README.md index 01f602dc6..42e2bd4dd 100644 --- a/agbenchmark/README.md +++ b/agbenchmark/README.md @@ -53,8 +53,7 @@ import os class TestWriteFile(BasicChallenge): """Testing if LLM can write to a file""" - @pytest.mark.depends(on=[], name="basic_write_file") - def test_method(self, workspace): + def test_method(self, config): # implement scoring logic by looking at workspace ``` @@ -82,7 +81,7 @@ Add the below to create a file in the workspace prior to running a challenge. On ## Workspace -If `--mock` flag is used it is at `agbenchmark/mocks/workspace`. Otherwise for mini-agi it is at `C:/Users//miniagi` - it will be automitcally set on config +If `--mock` flag is used it is at `agbenchmark/workspace`. 
Otherwise for mini-agi it is at `C:/Users//miniagi` - it will be automitcally set on config #### Dataset diff --git a/agbenchmark/RegressionManager.py b/agbenchmark/RegressionManager.py new file mode 100644 index 000000000..a1379ecae --- /dev/null +++ b/agbenchmark/RegressionManager.py @@ -0,0 +1,29 @@ +import json + + +class RegressionManager: + """Abstracts interaction with the regression tests file""" + + def __init__(self, filename: str): + self.filename = filename + self.load() + + def load(self) -> None: + try: + with open(self.filename, "r") as f: + self.tests = json.load(f) + except (FileNotFoundError, json.decoder.JSONDecodeError): + self.tests = {} + + def save(self) -> None: + with open(self.filename, "w") as f: + json.dump(self.tests, f, indent=4) + + def add_test(self, test_name: str, test_details: dict) -> None: + self.tests[test_name] = test_details + self.save() + + def remove_test(self, test_name: str) -> None: + if test_name in self.tests: + del self.tests[test_name] + self.save() diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index 4244fa082..1d43577c7 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -3,37 +3,27 @@ import shutil import subprocess import sys import time -from typing import Any, Dict, Optional +from typing import Any, Dict from dotenv import load_dotenv -from agbenchmark.mocks.mock_manager import MockManager - load_dotenv() -MOCK_FLAG = os.getenv("MOCK_TEST") +mock_test_str = os.getenv("MOCK_TEST") +MOCK_FLAG = mock_test_str.lower() == "true" if mock_test_str else False def run_agent( task: str, - mock_func: Optional[str], config: Dict[str, Any], challenge_location: str, ) -> None: """Calling to get a response""" - if MOCK_FLAG == "True": + if MOCK_FLAG: copy_artifacts_into_workspace( config["workspace"], "artifacts_out", challenge_location ) - if mock_func is None: - print("No mock provided") - return - mock_manager = MockManager( - task, config - ) # workspace doesn't need to be passed in, stays the same - print("Server unavailable, using mock", mock_func) - mock_manager.delegate(mock_func) else: timeout = config["cutoff"] print( @@ -99,6 +89,3 @@ def copy_artifacts_into_workspace( full_file_name = os.path.join(source_dir, file_name) if os.path.isfile(full_file_name): shutil.copy(full_file_name, workspace) - - -ENVIRONMENT = os.getenv("ENVIRONMENT") or "production" diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py index d7e1c8965..ddf69f42d 100644 --- a/agbenchmark/challenge.py +++ b/agbenchmark/challenge.py @@ -4,9 +4,8 @@ import os import subprocess import types from abc import ABC, ABCMeta -from typing import Any, Dict, List, Optional, Tuple, Type, cast +from typing import Any, Dict, List, Tuple, Type, cast -import pytest from dotenv import load_dotenv from agbenchmark.challenges.define_task_types import ChallengeData, Ground @@ -19,7 +18,6 @@ MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str else False class ChallengeMeta(ABCMeta): def __init__(self, name: str, bases: Tuple[Type, ...], dct: Dict[str, Any]) -> None: - super().__init__(name, bases, dct) try: frame = cast(types.FrameType, inspect.currentframe()) @@ -40,18 +38,13 @@ class Challenge(ABC, metaclass=ChallengeMeta): @property def data(self) -> ChallengeData: file_path = f"{self.CHALLENGE_LOCATION}/data.json" - Challenge._data_cache[file_path] = ChallengeData.deserialize(file_path) + if file_path not in Challenge._data_cache: + Challenge._data_cache[file_path] = ChallengeData.deserialize(file_path) 
return Challenge._data_cache[file_path] - @property - def mock(self) -> Optional[str]: - return self.data.mock.mock_func if self.data.mock else None - @property def task(self) -> str: - return str( - self.data.mock.mock_task if self.data.mock and MOCK_TEST else self.data.task - ) + return self.data.task @property def dependencies(self) -> list: @@ -64,17 +57,8 @@ class Challenge(ABC, metaclass=ChallengeMeta): config["workspace"], "artifacts_in", self.__class__.CHALLENGE_LOCATION ) - run_agent(self.task, self.mock, config, self.__class__.CHALLENGE_LOCATION) + run_agent(self.task, config, self.__class__.CHALLENGE_LOCATION) - @property - def name(self) -> str: - return self.data.name - - @pytest.mark.parametrize( - "challenge_data", - [data], - indirect=True, - ) def test_method(self, config: Dict[str, Any]) -> None: raise NotImplementedError @@ -151,3 +135,16 @@ class Challenge(ABC, metaclass=ChallengeMeta): ) return 1.0 + + def get_scores(self, config: Dict[str, Any]) -> List[float]: + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) + + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, self.data.ground) + print("Your score is:", score) + scores.append(score) + + return scores diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md index 2d782d1fc..305cd28f1 100644 --- a/agbenchmark/challenges/README.md +++ b/agbenchmark/challenges/README.md @@ -25,10 +25,9 @@ Example: ```python { - "name": "basic_write_file", "category": ["basic"], "task": "Print the the capital of America to a .txt file", - "dependencies": [], + "dependencies": ["TestWriteFile"], # the class name of the test "ground": { "answer": "Washington", "should_contain": ["Washington"], @@ -36,10 +35,6 @@ Example: "files": [".txt"], "type": "file" }, - "mock": { - "mock_func": "basic_write_file_mock", - "mock_task": "What is the capital of America?" 
- }, "info": { "difficulty": "basic", "description": "Tests the writing to file", diff --git a/agbenchmark/challenges/code/code.py b/agbenchmark/challenges/code/code.py deleted file mode 100644 index 508d24a90..000000000 --- a/agbenchmark/challenges/code/code.py +++ /dev/null @@ -1,8 +0,0 @@ -import pytest - -from agbenchmark.challenge import Challenge - - -@pytest.mark.code -class CodeChallenge(Challenge): - """Challenge for memory""" diff --git a/agbenchmark/challenges/code/d1/data.json b/agbenchmark/challenges/code/d1/data.json index c29c3d83a..6ac284b81 100644 --- a/agbenchmark/challenges/code/d1/data.json +++ b/agbenchmark/challenges/code/d1/data.json @@ -1,8 +1,7 @@ { - "name": "debug_simple_typo_with_guidance", "category": ["code"], "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", - "dependencies": [], + "dependencies": ["TestReadFile", "TestWriteFile"], "ground": { "answer": "[0, 1] [2, 5] [0, 3]", "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], @@ -10,10 +9,6 @@ "files": ["test.py"], "type": "execute_python_code" }, - "mock": { - "mock_func": null, - "mock_task": null - }, "info": { "difficulty": "basic", "description": "Tests ability for the agent to debug python code with a simple typo in it.", diff --git a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py index 16a12ae41..d104b3374 100644 --- a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py +++ b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py @@ -1,25 +1,13 @@ from typing import Any, Dict -import pytest +from agbenchmark.challenge import Challenge -from agbenchmark.challenges.code.code import CodeChallenge - -class TestDebugSimpleTypoWithGuidance(CodeChallenge): +class TestDebugSimpleTypoWithGuidance(Challenge): """The first memory challenge""" - @pytest.mark.depends(name="test_debug_simple_typo_with_guidance") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.get_artifacts_out( - config["workspace"], self.data.ground.files - ) - - scores = [] - for file_content in files_contents: - score = self.scoring(file_content, self.data.ground) - print("Your score is:", score) - scores.append(score) - + scores = self.get_scores(config) assert 1 in scores diff --git a/agbenchmark/challenges/code/d2/d2_test.py b/agbenchmark/challenges/code/d2/d2_test.py index 7a5988b94..b02114a75 100644 --- a/agbenchmark/challenges/code/d2/d2_test.py +++ b/agbenchmark/challenges/code/d2/d2_test.py @@ -1,28 +1,14 @@ from typing import Any, Dict -import pytest +from agbenchmark.challenge import Challenge -from agbenchmark.challenges.code.code import CodeChallenge - -class TestDebugSimpleTypoWithoutGuidance(CodeChallenge): +class TestDebugSimpleTypoWithoutGuidance(Challenge): """The first memory challenge""" - @pytest.mark.depends( - name="test_debug_simple_typo_without_guidance", - depends=["test_debug_simple_typo_with_guidance"], - ) def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.get_artifacts_out( - config["workspace"], self.data.ground.files - ) - - scores = [] - for file_content in files_contents: - score = self.scoring(file_content, self.data.ground) - print("Your score is:", score) - scores.append(score) + scores = self.get_scores(config) assert 1 in scores diff --git a/agbenchmark/challenges/code/d2/data.json 
b/agbenchmark/challenges/code/d2/data.json index 6003055a8..3de5111f5 100644 --- a/agbenchmark/challenges/code/d2/data.json +++ b/agbenchmark/challenges/code/d2/data.json @@ -1,8 +1,7 @@ { - "name": "debug_simple_typo_without_guidance", "category": ["code"], "task": "Make test.py run without errors.", - "dependencies": [], + "dependencies": ["TestDebugSimpleTypoWithGuidance"], "ground": { "answer": "[0, 1] [2, 5] [0, 3]", "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], @@ -10,12 +9,8 @@ "files": ["test.py"], "type": "execute_python_code" }, - "mock": { - "mock_func": null, - "mock_task": null - }, "info": { - "difficulty": "basic", + "difficulty": "medium", "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", "side_effects": ["tests if there is in fact an LLM attached"] } diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index f84df1262..308cb5ea6 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -4,11 +4,6 @@ from typing import List, Optional from pydantic import BaseModel -class Mock(BaseModel): - mock_func: Optional[str] = None - mock_task: Optional[str] = None - - class Info(BaseModel): difficulty: str description: str @@ -24,12 +19,10 @@ class Ground(BaseModel): class ChallengeData(BaseModel): - name: str category: List[str] task: str dependencies: List[str] ground: Ground - mock: Optional[Mock] = None info: Info def serialize(self, path: str) -> None: diff --git a/agbenchmark/challenges/interface/browse_test.py b/agbenchmark/challenges/interface/browse_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/challenges/interface/read_file/artifacts_in/file_to_check.txt b/agbenchmark/challenges/interface/read_file/artifacts_in/file_to_check.txt new file mode 100644 index 000000000..980a0d5f1 --- /dev/null +++ b/agbenchmark/challenges/interface/read_file/artifacts_in/file_to_check.txt @@ -0,0 +1 @@ +Hello World! diff --git a/agbenchmark/challenges/interface/read_file/artifacts_out/file_to_check.txt b/agbenchmark/challenges/interface/read_file/artifacts_out/file_to_check.txt new file mode 100644 index 000000000..c1a7879a1 --- /dev/null +++ b/agbenchmark/challenges/interface/read_file/artifacts_out/file_to_check.txt @@ -0,0 +1 @@ +random string Hello World! 
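[With the `Mock` model dropped from `define_task_types.py` above, a challenge definition reduces to category, task, dependencies, ground and info. A sketch of loading one by hand; `ChallengeData.deserialize` itself is not shown in this patch, so plain `json.load` plus pydantic validation is assumed here.]

```python
import json

from agbenchmark.challenges.define_task_types import ChallengeData

with open("agbenchmark/challenges/interface/write_file/data.json") as f:
    raw = json.load(f)

# Pydantic validates the remaining fields; the extra "name" key still
# present in some data.json files is ignored by default in pydantic v1.
challenge = ChallengeData(**raw)
print(challenge.dependencies)  # [] for TestWriteFile, class names elsewhere
```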
diff --git a/agbenchmark/challenges/interface/read_file/data.json b/agbenchmark/challenges/interface/read_file/data.json new file mode 100644 index 000000000..dd399fabf --- /dev/null +++ b/agbenchmark/challenges/interface/read_file/data.json @@ -0,0 +1,17 @@ +{ + "name": "ReadFile", + "category": ["interface"], + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "dependencies": ["TestWriteFile"], + "ground": { + "answer": "random string Hello World!", + "should_contain": ["random string", "Hello World!"], + "files": ["file_to_check.txt"], + "type": "file" + }, + "info": { + "description": "This reads the file quickly", + "difficulty": "basic", + "side_effects": [""] + } +} diff --git a/agbenchmark/challenges/interface/read_file/read_file_test.py b/agbenchmark/challenges/interface/read_file/read_file_test.py new file mode 100644 index 000000000..591d0a744 --- /dev/null +++ b/agbenchmark/challenges/interface/read_file/read_file_test.py @@ -0,0 +1,12 @@ +from typing import Any, Dict + +from agbenchmark.challenge import Challenge + + +class TestReadFile(Challenge): + """Testing if LLM can read a file""" + + def test_method(self, config: Dict[str, Any]) -> None: + self.setup_challenge(config) + scores = self.get_scores(config) + assert 1 in scores diff --git a/agbenchmark/challenges/interface/write_file/artifacts_out/random_file.txt b/agbenchmark/challenges/interface/write_file/artifacts_out/random_file.txt new file mode 100644 index 000000000..1f275fb98 --- /dev/null +++ b/agbenchmark/challenges/interface/write_file/artifacts_out/random_file.txt @@ -0,0 +1 @@ +Washington diff --git a/agbenchmark/challenges/interface/write_file/data.json b/agbenchmark/challenges/interface/write_file/data.json new file mode 100644 index 000000000..b3e4b6f02 --- /dev/null +++ b/agbenchmark/challenges/interface/write_file/data.json @@ -0,0 +1,18 @@ +{ + "name": "WriteFile", + "category": ["interface"], + "task": "Print the the capital of America to a .txt file", + "dependencies": [], + "ground": { + "answer": "Washington", + "should_contain": ["Washington"], + "should_not_contain": ["New York", "Los Angeles", "San Francisco"], + "files": [".txt"], + "type": "file" + }, + "info": { + "difficulty": "basic", + "description": "Tests the writing to file", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/interface/write_file/write_file_test.py b/agbenchmark/challenges/interface/write_file/write_file_test.py new file mode 100644 index 000000000..4a52b0979 --- /dev/null +++ b/agbenchmark/challenges/interface/write_file/write_file_test.py @@ -0,0 +1,13 @@ +from typing import Any, Dict + +from agbenchmark.challenge import Challenge + + +class TestWriteFile(Challenge): + """Testing if LLM can write to a file""" + + def test_method(self, config: Dict[str, Any]) -> None: + self.setup_challenge(config) + + scores = self.get_scores(config) + assert 1 in scores diff --git a/agbenchmark/challenges/memory/m1/data.json b/agbenchmark/challenges/memory/m1/data.json index c7d441903..f771a2669 100644 --- a/agbenchmark/challenges/memory/m1/data.json +++ b/agbenchmark/challenges/memory/m1/data.json @@ -1,8 +1,7 @@ { - "name": "basic_memory", "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", - "dependencies": [], + "dependencies": ["TestReadFile", "TestWriteFile"], "ground": { "answer": "2314", "should_contain": ["2314"], diff --git a/agbenchmark/challenges/memory/m1/m1_test.py 
b/agbenchmark/challenges/memory/m1/m1_test.py index 9e5e0a775..0fc537eeb 100644 --- a/agbenchmark/challenges/memory/m1/m1_test.py +++ b/agbenchmark/challenges/memory/m1/m1_test.py @@ -1,25 +1,13 @@ from typing import Any, Dict -import pytest +from agbenchmark.challenge import Challenge -from agbenchmark.challenges.memory.memory import MemoryChallenge - -class TestBasicMemory(MemoryChallenge): +class TestBasicMemory(Challenge): """The first memory challenge""" - @pytest.mark.depends(name="test_basic_memory") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.get_artifacts_out( - config["workspace"], self.data.ground.files - ) - - scores = [] - for file_content in files_contents: - score = self.scoring(file_content, self.data.ground) - print("Your score is:", score) - scores.append(score) - + scores = self.get_scores(config) assert 1 in scores diff --git a/agbenchmark/challenges/memory/m2/data.json b/agbenchmark/challenges/memory/m2/data.json index 6e898298a..998e894b1 100644 --- a/agbenchmark/challenges/memory/m2/data.json +++ b/agbenchmark/challenges/memory/m2/data.json @@ -1,8 +1,7 @@ { - "name": "remember_multiple_ids", "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", - "dependencies": [], + "dependencies": ["TestBasicMemory"], "ground": { "answer": "3145\n3791\n9317\n9471", "should_contain": ["3145", "3791", "9317", "9471"], @@ -10,10 +9,6 @@ "files": [".txt"], "type": "file" }, - "mock": { - "mock_func": "remember_multiple_ids_mock", - "mock_task": "Follow the instructions in the instructions_1.txt file" - }, "info": { "difficulty": "basic", "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", diff --git a/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py b/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py index 6ba38dad3..c88f28831 100644 --- a/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py +++ b/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py @@ -1,27 +1,13 @@ from typing import Any, Dict -import pytest +from agbenchmark.challenge import Challenge -from agbenchmark.challenges.memory.memory import MemoryChallenge - -class TestRememberMultipleIds(MemoryChallenge): +class TestRememberMultipleIds(Challenge): """The first memory challenge""" - @pytest.mark.depends( - name="test_remember_multiple_ids", depends=["test_basic_memory"] - ) def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.get_artifacts_out( - config["workspace"], self.data.ground.files - ) - - scores = [] - for file_content in files_contents: - score = self.scoring(file_content, self.data.ground) - print("Your score is:", score) - scores.append(score) - + scores = self.get_scores(config) assert 1 in scores diff --git a/agbenchmark/challenges/memory/m3/data.json b/agbenchmark/challenges/memory/m3/data.json index 2a4f06ff7..d5d95b1de 100644 --- a/agbenchmark/challenges/memory/m3/data.json +++ b/agbenchmark/challenges/memory/m3/data.json @@ -1,8 +1,7 @@ { - "name": "remember_multiple_ids_with_noise_mock", "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", - "dependencies": [], + "dependencies": ["TestRememberMultipleIds"], "ground": { "answer": "3145\n3791\n9317\n9471", "should_contain": ["3145", "3791", "9317", "9471"], @@ 
-10,10 +9,6 @@ "files": [".txt"], "type": "file" }, - "mock": { - "mock_func": "remember_multiple_ids_mock", - "mock_task": "Follow the instructions in the instructions_1.txt file" - }, "info": { "difficulty": "medium", "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", diff --git a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py index 037a6929e..0e35dd2f4 100644 --- a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py +++ b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py @@ -1,28 +1,13 @@ from typing import Any, Dict -import pytest +from agbenchmark.challenge import Challenge -from agbenchmark.challenges.memory.memory import MemoryChallenge - -class TestRememberMultipleIdsWithNoise(MemoryChallenge): +class TestRememberMultipleIdsWithNoise(Challenge): """The first memory challenge""" - @pytest.mark.depends( - name="test_remember_multiple_ids_with_noise", - depends=["test_remember_multiple_ids"], - ) def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.get_artifacts_out( - config["workspace"], self.data.ground.files - ) - - scores = [] - for file_content in files_contents: - score = self.scoring(file_content, self.data.ground) - print("Your score is:", score) - scores.append(score) - + scores = self.get_scores(config) assert 1 in scores diff --git a/agbenchmark/challenges/memory/m4/data.json b/agbenchmark/challenges/memory/m4/data.json index adfd8e33f..49831537e 100644 --- a/agbenchmark/challenges/memory/m4/data.json +++ b/agbenchmark/challenges/memory/m4/data.json @@ -1,8 +1,7 @@ { - "name": "remember_multiple_phrases_with_noise_mock", "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", - "dependencies": [], + "dependencies": ["TestRememberMultipleIdsWithNoise"], "ground": { "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyoncé on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", "should_contain": [ @@ -15,10 +14,6 @@ "files": [".txt"], "type": "file" }, - "mock": { - "mock_func": "remember_multiple_phrases_with_noise_mock", - "mock_task": "Follow the instructions in the instructions_1.txt file" - }, "info": { "difficulty": "medium", "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", diff --git a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py index 2c931af8c..4c4bdce55 100644 --- a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py +++ b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py @@ -1,28 +1,13 @@ from typing import Any, Dict -import pytest +from agbenchmark.challenge import Challenge -from agbenchmark.challenges.memory.memory import MemoryChallenge - -class TestRememberMultiplePhrasesWithNoise(MemoryChallenge): +class TestRememberMultiplePhrasesWithNoise(Challenge): """The first memory challenge""" - @pytest.mark.depends( - name="test_remember_multiple_phrases_with_noise", - depends=["test_remember_multiple_ids_with_noise"], - ) def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.get_artifacts_out( - config["workspace"], self.data.ground.files - ) - - scores = [] - for file_content in files_contents: - score = self.scoring(file_content, self.data.ground) - print("Your score is:", score) - scores.append(score) - + scores = self.get_scores(config) assert 1 in scores diff --git a/agbenchmark/challenges/memory/memory.py b/agbenchmark/challenges/memory/memory.py deleted file mode 100644 index 429bef23a..000000000 --- a/agbenchmark/challenges/memory/memory.py +++ /dev/null @@ -1,8 +0,0 @@ -import pytest - -from agbenchmark.challenge import Challenge - - -@pytest.mark.memory -class MemoryChallenge(Challenge): - """Challenge for memory""" diff --git a/agbenchmark/challenges/retrieval/r1/data.json b/agbenchmark/challenges/retrieval/r1/data.json index 8fca01b78..6e1344b8b 100644 --- a/agbenchmark/challenges/retrieval/r1/data.json +++ b/agbenchmark/challenges/retrieval/r1/data.json @@ -1,8 +1,7 @@ { - "name": "basic_information_retrieval", "category": ["retrieval"], "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", - "dependencies": [], + "dependencies": ["TestWriteFile"], "ground": { "answer": "£25.89", "should_contain": ["25.89"], @@ -10,10 +9,6 @@ "files": [".txt"], "type": "file" }, - "mock": { - "mock_func": "basic_retrieval_mock", - "mock_task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file." 
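[Every rewritten test above now funnels its output files through `get_scores`, which calls `Challenge.scoring`; the scoring body itself is not part of this diff. A minimal sketch consistent with how `should_contain` and `should_not_contain` are used in the surrounding data.json files, offered as an illustration rather than the project's actual implementation.]

```python
from agbenchmark.challenges.define_task_types import Ground


def scoring_sketch(content: str, ground: Ground) -> float:
    # All should_contain phrases must appear in the produced file...
    if ground.should_contain:
        for phrase in ground.should_contain:
            if phrase not in content:
                return 0.0
    # ...and no should_not_contain phrase may appear.
    if ground.should_not_contain:
        for phrase in ground.should_not_contain:
            if phrase in content:
                return 0.0
    return 1.0
```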
- }, "info": { "difficulty": "basic", "description": "Tests ability to retrieve information from a website.", diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 68d3de4e3..9845a7b2a 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -1,25 +1,13 @@ from typing import Any, Dict -import pytest +from agbenchmark.challenge import Challenge -from agbenchmark.challenges.retrieval.retrieval import RetrievalChallenge - -class TestRetrieval(RetrievalChallenge): +class TestRetrieval(Challenge): """The first information-retrieval challenge""" - @pytest.mark.depends(name="test_retrieval") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.get_artifacts_out( - config["workspace"], self.data.ground.files - ) - - scores = [] - for file_content in files_contents: - score = self.scoring(file_content, self.data.ground) - print("Your score is:", score) - scores.append(score) - + scores = self.get_scores(config) assert 1 in scores diff --git a/agbenchmark/challenges/retrieval/r2/data.json b/agbenchmark/challenges/retrieval/r2/data.json index 3c388f192..05846b9f3 100644 --- a/agbenchmark/challenges/retrieval/r2/data.json +++ b/agbenchmark/challenges/retrieval/r2/data.json @@ -1,8 +1,7 @@ { - "name": "basic_information_retrieval", "category": ["retrieval"], "task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", - "dependencies": [], + "dependencies": ["TestRetrieval"], "ground": { "answer": "81,462", "should_contain": ["81,462"], @@ -10,10 +9,6 @@ "files": [".txt"], "type": "file" }, - "mock": { - "mock_func": "basic_retrieval_2_mock", - "mock_task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)." - }, "info": { "difficulty": "basic", "description": "Tests ability to retrieve information.", diff --git a/agbenchmark/challenges/retrieval/r2/r2_test.py b/agbenchmark/challenges/retrieval/r2/r2_test.py index 5a1a20690..f0f13ffbf 100644 --- a/agbenchmark/challenges/retrieval/r2/r2_test.py +++ b/agbenchmark/challenges/retrieval/r2/r2_test.py @@ -1,25 +1,13 @@ from typing import Any, Dict -import pytest +from agbenchmark.challenge import Challenge -from agbenchmark.challenges.retrieval.retrieval import RetrievalChallenge - -class TestRetrieval2(RetrievalChallenge): +class TestRetrieval2(Challenge): """The first information-retrieval challenge""" - @pytest.mark.depends(on=["test_retrieval"], name="test_retrieval_2") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.get_artifacts_out( - config["workspace"], self.data.ground.files - ) - - scores = [] - for file_content in files_contents: - score = self.scoring(file_content, self.data.ground) - print("Your score is:", score) - scores.append(score) - + scores = self.get_scores(config) assert 1 in scores diff --git a/agbenchmark/challenges/retrieval/r3/data.json b/agbenchmark/challenges/retrieval/r3/data.json index 415456155..763c963ec 100644 --- a/agbenchmark/challenges/retrieval/r3/data.json +++ b/agbenchmark/challenges/retrieval/r3/data.json @@ -1,19 +1,30 @@ { - "name": "basic_information_retrieval", "category": ["retrieval"], "task": "Write tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", - "dependencies": [], + "dependencies": ["TestRetrieval2"], "ground": { "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", - "should_contain": ["15", "112", "117", "204", "413", "2,014", "3,198", "4,046", "7,000", "11,759", "21,461", "24,578", "31,536", "53,823", "81,462"], + "should_contain": [ + "15", + "112", + "117", + "204", + "413", + "2,014", + "3,198", + "4,046", + "7,000", + "11,759", + "21,461", + "24,578", + "31,536", + "53,823", + "81,462" + ], "should_not_contain": [], "files": [".txt"], "type": "file" }, - "mock": { - "mock_func": "basic_retrieval_3_mock", - "mock_task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)." - }, "info": { "difficulty": "basic", "description": "Tests ability to retrieve information.", diff --git a/agbenchmark/challenges/retrieval/r3/r3_test.py b/agbenchmark/challenges/retrieval/r3/r3_test.py index c4b4bcf12..5887c0b43 100644 --- a/agbenchmark/challenges/retrieval/r3/r3_test.py +++ b/agbenchmark/challenges/retrieval/r3/r3_test.py @@ -1,25 +1,14 @@ from typing import Any, Dict -import pytest +from agbenchmark.challenge import Challenge -from agbenchmark.challenges.retrieval.retrieval import RetrievalChallenge - -class TestRetrieval3(RetrievalChallenge): +class TestRetrieval3(Challenge): """The first information-retrieval challenge""" - @pytest.mark.depends(on=["test_retrieval_2"], name="test_retrieval_3") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.get_artifacts_out( - config["workspace"], self.data.ground.files - ) - - scores = [] - for file_content in files_contents: - score = self.scoring(file_content, self.data.ground) - print("Your score is:", score) - scores.append(score) + scores = self.get_scores(config) assert 1 in scores diff --git a/agbenchmark/challenges/retrieval/retrieval.py b/agbenchmark/challenges/retrieval/retrieval.py deleted file mode 100644 index 891cccef7..000000000 --- a/agbenchmark/challenges/retrieval/retrieval.py +++ /dev/null @@ -1,8 +0,0 @@ -import pytest - -from agbenchmark.challenge import Challenge - - -@pytest.mark.retrieval -class RetrievalChallenge(Challenge): - """Challenge for information-retrieval""" diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 40457fb67..ffbb26202 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -2,12 +2,16 @@ import json import os import shutil from pathlib import Path # noqa -from typing import Any, Dict, Generator, List +from typing import Any, Dict, Generator import pytest -from agbenchmark.start_benchmark import CONFIG_PATH, REGRESSION_TESTS_PATH -from agbenchmark.tests.regression.RegressionManager import RegressionManager +from agbenchmark.RegressionManager import RegressionManager +from agbenchmark.start_benchmark import ( + CONFIG_PATH, + REGRESSION_TESTS_PATH, + get_regression_data, +) def resolve_workspace(config: Dict[str, Any]) -> str: @@ -37,7 +41,7 @@ def config(request: Any) -> None: config = json.load(f) if request.config.getoption("--mock"): - config["workspace"] = "agbenchmark/mocks/workspace" + config["workspace"] = "agbenchmark/workspace" 
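[The config fixture in the conftest hunks here now accepts either a single workspace string or an input/output dict. As committed, though, both dict branches call `resolve_workspace(config)` on the whole config, whose `workspace` value is a dict, so the `startswith` check inside the resolver would raise; a per-key wrapper along these lines (name hypothetical) appears to be what is intended.]

```python
from typing import Any, Dict

from agbenchmark.conftest import resolve_workspace


def resolve_workspace_entry(config: Dict[str, Any], key: str) -> str:
    # Hypothetical helper: substitute one half of the input/output dict
    # so the string-based resolve_workspace sees the shape it expects.
    sub_config = dict(config)
    sub_config["workspace"] = config["workspace"][key]
    return resolve_workspace(sub_config)


# config["workspace"]["input"] = resolve_workspace_entry(config, "input")
# config["workspace"]["output"] = resolve_workspace_entry(config, "output")
```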
elif isinstance(config["workspace"], str): config["workspace"] = resolve_workspace(config) else: # it's a input output dict @@ -77,9 +81,22 @@ def workspace(config: Dict[str, Any]) -> Generator[str, None, None]: def pytest_addoption(parser: Any) -> None: parser.addoption("--mock", action="store_true", default=False) + parser.addoption("--improve", action="store_true", default=False) + parser.addoption("--maintain", action="store_true", default=False) -regression_manager = RegressionManager(REGRESSION_TESTS_PATH) +@pytest.fixture(autouse=True) +def check_regression(request: Any) -> None: + test_name = request.node.parent.name + data = get_regression_data() + + # Check if the test name exists in the regression tests + if request.config.getoption("--improve") and data.get(test_name, None): + pytest.skip("Skipping test because it's a regression test and --improve is set") + elif request.config.getoption("--maintain") and not data.get(test_name, None): + pytest.skip( + "Skipping test because it's not a regression test and --maintain is set" + ) # this is to get the challenge_data from every test @@ -88,6 +105,9 @@ def challenge_data(request: Any) -> None: return request.param +regression_manager = RegressionManager(REGRESSION_TESTS_PATH) + + def pytest_runtest_makereport(item: Any, call: Any) -> None: if call.when == "call": challenge_data = item.funcargs.get("challenge_data", None) @@ -109,16 +129,6 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None: regression_manager.remove_test(item.nodeid.split("::")[1]) -def pytest_collection_modifyitems(items: List[Any]) -> None: - """Called once all test items are collected. Used - to add regression and depends markers to collected test items.""" - for item in items: - # regression add - if item.nodeid.split("::")[1] in regression_manager.tests: - print(regression_manager.tests) - item.add_marker(pytest.mark.regression) - - def pytest_sessionfinish() -> None: """Called at the end of the session to save regression tests""" regression_manager.save() @@ -135,3 +145,29 @@ def pytest_generate_tests(metafunc: Any) -> None: # Add the parameters to the test function metafunc.parametrize("challenge_data", [params], indirect=True) + + +# this is adding the dependency marker and category markers automatically from the json +def pytest_collection_modifyitems(items: Any, config: Any) -> None: + data = get_regression_data() + + for item in items: + # Assuming item.cls is your test class + test_class_instance = item.cls() + + # Then you can access your properties + name = item.parent.cls.__name__ + dependencies = test_class_instance.data.dependencies + + # Filter dependencies if they exist in regression data if its an improvement test + if config.getoption("--improve"): + dependencies = [dep for dep in dependencies if not data.get(dep, None)] + + categories = test_class_instance.data.category + + # Add depends marker dynamically + item.add_marker(pytest.mark.depends(on=dependencies, name=name)) + + # Add category marker dynamically + for category in categories: + item.add_marker(getattr(pytest.mark, category)) diff --git a/agbenchmark/mocks/mock_manager.py b/agbenchmark/mocks/mock_manager.py deleted file mode 100644 index 3a227e49b..000000000 --- a/agbenchmark/mocks/mock_manager.py +++ /dev/null @@ -1,28 +0,0 @@ -from typing import Any, Dict, Optional - -import agbenchmark.mocks.tests.basic_mocks as basic_mocks - - -class MockManager: - def __init__(self, task: Optional[str], config: Dict[str, Any]) -> None: - self.task = task - self.workspace = 
config["workspace"] - self.modules = [basic_mocks] - - def delegate(self, mock_function_name: Any, *args: Any, **kwargs: Any) -> None: - if hasattr(self, mock_function_name): - # Check if the mock function is an attribute of this class - getattr(self, mock_function_name)(*args, **kwargs) - elif mock_function_name in globals(): - # Check if the function is imported in the file - func = globals()[mock_function_name] - func(self.task, self.workspace, *args, **kwargs) - elif len(self.modules) > 0: - # checks if function is in imported modules - for module in self.modules: - if hasattr(module, mock_function_name): - func = getattr(module, mock_function_name) - func(self.task, self.workspace, *args, **kwargs) - return - else: - raise ValueError(f"No such mock: {mock_function_name}") diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py deleted file mode 100644 index e4a1dedc0..000000000 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ /dev/null @@ -1,12 +0,0 @@ -from agbenchmark.challenge import Challenge - - -def example_mock(task: str, workspace: str) -> None: - """ - This mock writes to a file (creates one if it doesn't exist) - """ - Challenge.write_to_file( - workspace, - "file_to_check.txt", - "This is an example showing how you can use mocks but here you can use artifacts_out folder instead of a mock.", - ) diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index 295bbf4bf..f78e86a1c 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -2,11 +2,11 @@ import json import os import sys from pathlib import Path -from typing import List +from typing import Any import click import pytest -from dotenv import load_dotenv, set_key +from dotenv import load_dotenv load_dotenv() @@ -26,10 +26,17 @@ def cli() -> None: @cli.command() @click.option("--category", default=None, help="Specific category to run") @click.option("--maintain", is_flag=True, help="Runs only regression tests") +@click.option("--improve", is_flag=True, help="Run only non-regression tests") @click.option("--mock", is_flag=True, help="Run with mock") -def start(category: str, maintain: bool, mock: bool) -> int: +def start(category: str, maintain: bool, improve: bool, mock: bool) -> int: """Start the benchmark tests. If a category flag is provided, run the categories with that mark.""" # Check if configuration file exists and is not empty + if maintain and improve: + print( + "Error: You can't use both --maintain and --improve at the same time. Please choose one." 
+ ) + return 1 + if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0: config = {} @@ -55,7 +62,7 @@ def start(category: str, maintain: bool, mock: bool) -> int: with open(CONFIG_PATH, "r") as f: config = json.load(f) - set_key(".env", "MOCK_TEST", "True" if mock else "False") + os.environ["MOCK_TEST"] = "True" if mock else "False" if not os.path.exists(REGRESSION_TESTS_PATH): with open(REGRESSION_TESTS_PATH, "a"): @@ -65,42 +72,31 @@ def start(category: str, maintain: bool, mock: bool) -> int: for key, value in config.items(): print(f"{key}: {value}") - print("Starting benchmark tests...", category) - tests_to_run = [] pytest_args = ["-vs"] if category: pytest_args.extend(["-m", category]) + print("Starting benchmark tests ", category) else: - if maintain: - print("Running all regression tests") - tests_to_run = get_regression_tests() - else: - print("Running all categories") + print("Running all categories") + + if maintain: + print("Running only regression tests") + pytest_args.append("--maintain") + elif improve: + print("Running only non-regression tests") + pytest_args.append("--improve") if mock: pytest_args.append("--mock") - # Run pytest with the constructed arguments - if not tests_to_run: - tests_to_run = [str(CURRENT_DIRECTORY)] - pytest_args.extend(tests_to_run) - return sys.exit(pytest.main(pytest_args)) -def get_regression_tests() -> List[str]: - if not Path(REGRESSION_TESTS_PATH).exists(): - with open(REGRESSION_TESTS_PATH, "w") as file: - json.dump({}, file) - +def get_regression_data() -> Any: with open(REGRESSION_TESTS_PATH, "r") as file: data = json.load(file) - regression_tests = [ - str(CURRENT_DIRECTORY / ".." / value["test"]) for key, value in data.items() - ] - - return regression_tests + return data if __name__ == "__main__": diff --git a/agbenchmark/tests/basic_abilities/basic_challenge.py b/agbenchmark/tests/basic_abilities/basic_challenge.py deleted file mode 100644 index 8b3a4db1d..000000000 --- a/agbenchmark/tests/basic_abilities/basic_challenge.py +++ /dev/null @@ -1,8 +0,0 @@ -import pytest - -from agbenchmark.challenge import Challenge - - -@pytest.mark.basic -class BasicChallenge(Challenge): - pass diff --git a/agbenchmark/tests/basic_abilities/browse_test.py b/agbenchmark/tests/basic_abilities/browse_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/tests/basic_abilities/read_file/artifacts_in/file_to_check.txt b/agbenchmark/tests/basic_abilities/read_file/artifacts_in/file_to_check.txt deleted file mode 100644 index 980a0d5f1..000000000 --- a/agbenchmark/tests/basic_abilities/read_file/artifacts_in/file_to_check.txt +++ /dev/null @@ -1 +0,0 @@ -Hello World! diff --git a/agbenchmark/tests/basic_abilities/read_file/artifacts_out/file_to_check.txt b/agbenchmark/tests/basic_abilities/read_file/artifacts_out/file_to_check.txt deleted file mode 100644 index c1a7879a1..000000000 --- a/agbenchmark/tests/basic_abilities/read_file/artifacts_out/file_to_check.txt +++ /dev/null @@ -1 +0,0 @@ -random string Hello World! 
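[A trace of how `start` now assembles its pytest invocation (flag values illustrative). Setting `MOCK_TEST` through `os.environ` instead of `set_key(".env", ...)` keeps the flag scoped to this process and its children rather than persisting it to disk between runs.]

```python
import os

category, maintain, improve, mock = "memory", False, True, True

os.environ["MOCK_TEST"] = "True" if mock else "False"  # read by agent_interface

pytest_args = ["-vs"]
if category:
    pytest_args.extend(["-m", category])
if maintain:
    pytest_args.append("--maintain")
elif improve:
    pytest_args.append("--improve")
if mock:
    pytest_args.append("--mock")

print(pytest_args)  # ['-vs', '-m', 'memory', '--improve', '--mock']
```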
diff --git a/agbenchmark/tests/basic_abilities/read_file/data.json b/agbenchmark/tests/basic_abilities/read_file/data.json deleted file mode 100644 index 7463d22fc..000000000 --- a/agbenchmark/tests/basic_abilities/read_file/data.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "name": "basic_read_file", - "category": ["basic"], - "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", - "dependencies": ["basic_write_file"], - "ground": { - "answer": "random string Hello World!", - "should_contain": ["random string", "Hello World!"], - "files": ["file_to_check.txt"], - "type": "file" - }, - "mock": { - "mock_func": "basic_read_file_mock" - }, - "info": { - "description": "This reads the file quickly", - "difficulty": "basic", - "side_effects": [""] - } -} diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py deleted file mode 100644 index cf5dceb69..000000000 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ /dev/null @@ -1,24 +0,0 @@ -from typing import Any, Dict - -import pytest - -from agbenchmark.tests.basic_abilities.basic_challenge import BasicChallenge - - -class TestReadFile(BasicChallenge): - """Testing if LLM can read a file""" - - @pytest.mark.depends(on=["basic_write_file"], name="basic_read_file") - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - files_contents = self.get_artifacts_out( - config["workspace"], self.data.ground.files - ) - - scores = [] - for file_content in files_contents: - score = self.scoring(file_content, self.data.ground) - print("Your score is:", score) - scores.append(score) - - assert 1 in scores diff --git a/agbenchmark/tests/basic_abilities/remember_context_test.py b/agbenchmark/tests/basic_abilities/remember_context_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/tests/basic_abilities/write_file/artifacts_out/random_file.txt b/agbenchmark/tests/basic_abilities/write_file/artifacts_out/random_file.txt deleted file mode 100644 index 1f275fb98..000000000 --- a/agbenchmark/tests/basic_abilities/write_file/artifacts_out/random_file.txt +++ /dev/null @@ -1 +0,0 @@ -Washington diff --git a/agbenchmark/tests/basic_abilities/write_file/data.json b/agbenchmark/tests/basic_abilities/write_file/data.json deleted file mode 100644 index 9232a45a0..000000000 --- a/agbenchmark/tests/basic_abilities/write_file/data.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "name": "basic_write_file", - "category": ["basic"], - "task": "Print the the capital of America to a .txt file", - "dependencies": [], - "ground": { - "answer": "Washington", - "should_contain": ["Washington"], - "should_not_contain": ["New York", "Los Angeles", "San Francisco"], - "files": [".txt"], - "type": "file" - }, - "mock": { - "mock_func": "basic_write_file_mock", - "mock_task": "What is the capital of America?" 
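[The deleted files above show the convention this patch retires: dependencies were duplicated as hand-written marker names in both data.json (`"basic_write_file"`) and a `@pytest.mark.depends` decorator. Under the new scheme, data.json lists test class names and the conftest collection hook attaches the markers. A sketch of the per-item step that hook performs.]

```python
from typing import Any, List

import pytest


def add_challenge_markers(
    item: Any, name: str, dependencies: List[str], categories: List[str]
) -> None:
    # What pytest_collection_modifyitems now does for each collected test:
    # markers come from data.json (class names such as "TestWriteFile")
    # instead of hand-written decorators on every test_method.
    item.add_marker(pytest.mark.depends(on=dependencies, name=name))
    for category in categories:
        item.add_marker(getattr(pytest.mark, category))
```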
- }, - "info": { - "difficulty": "basic", - "description": "Tests the writing to file", - "side_effects": ["tests if there is in fact an LLM attached"] - } -} diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py deleted file mode 100644 index ba0395186..000000000 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ /dev/null @@ -1,25 +0,0 @@ -from typing import Any, Dict - -import pytest - -from agbenchmark.tests.basic_abilities.basic_challenge import BasicChallenge - - -class TestWriteFile(BasicChallenge): - """Testing if LLM can write to a file""" - - @pytest.mark.depends(name="basic_write_file") - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - - files_contents = self.get_artifacts_out( - config["workspace"], self.data.ground.files - ) - - scores = [] - for file_content in files_contents: - score = self.scoring(file_content, self.data.ground) - print("Your score is:", score) - scores.append(score) - - assert 1 in scores diff --git a/agbenchmark/tests/regression/RegressionManager.py b/agbenchmark/tests/regression/RegressionManager.py deleted file mode 100644 index a1379ecae..000000000 --- a/agbenchmark/tests/regression/RegressionManager.py +++ /dev/null @@ -1,29 +0,0 @@ -import json - - -class RegressionManager: - """Abstracts interaction with the regression tests file""" - - def __init__(self, filename: str): - self.filename = filename - self.load() - - def load(self) -> None: - try: - with open(self.filename, "r") as f: - self.tests = json.load(f) - except (FileNotFoundError, json.decoder.JSONDecodeError): - self.tests = {} - - def save(self) -> None: - with open(self.filename, "w") as f: - json.dump(self.tests, f, indent=4) - - def add_test(self, test_name: str, test_details: dict) -> None: - self.tests[test_name] = test_details - self.save() - - def remove_test(self, test_name: str) -> None: - if test_name in self.tests: - del self.tests[test_name] - self.save() diff --git a/agent/Auto-GPT b/agent/Auto-GPT index dd65cc256..5a36e43b7 160000 --- a/agent/Auto-GPT +++ b/agent/Auto-GPT @@ -1 +1 @@ -Subproject commit dd65cc256ca72cb199fe8c5d6ae31c23a7acee62 +Subproject commit 5a36e43b782fdaef8a7270109f8347f0323211d2 diff --git a/agent/gpt-engineer b/agent/gpt-engineer index 155ea895e..fd705f89a 160000 --- a/agent/gpt-engineer +++ b/agent/gpt-engineer @@ -1 +1 @@ -Subproject commit 155ea895eb5f7e44ed8647b335d90a03b5ffb06d +Subproject commit fd705f89afd53469c91935a9cae7b92a564025eb diff --git a/agent/smol-developer b/agent/smol-developer index 5a3ad4310..a1e4a9ff3 160000 --- a/agent/smol-developer +++ b/agent/smol-developer @@ -1 +1 @@ -Subproject commit 5a3ad43103b238b9c8f2a2acceff250888be263e +Subproject commit a1e4a9ff3a75909c4a892e409a55f86a2c57b7c6 diff --git a/config.json b/config.json index 88526a134..8bbcebdbd 100644 --- a/config.json +++ b/config.json @@ -1,6 +1,6 @@ { - "workspace": "projects/my-new-project/workspace", - "entry_path": "agent/gpt-engineer/benchmarks.py", - "home_path": "agent/gpt-engineer", + "workspace": "${os.path.join(Path.home(), 'miniagi')}", + "entry_path": "benchmarks.py", + "home_path": "agent/mini-agi", "cutoff": 60 } diff --git a/pyproject.toml b/pyproject.toml index 33a8671cf..a8f4f8dee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ testpaths = [ markers = [ "retrieval", "regression", - "basic", + "interface", "code", "memory" ] diff --git a/regression_tests.json b/regression_tests.json 
index 9714d42a8..44334801e 100644 --- a/regression_tests.json +++ b/regression_tests.json @@ -1,14 +1,4 @@ { - "TestDebugSimpleTypoWithGuidance": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py" - }, - "TestDebugSimpleTypoWithoutGuidance": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/code/d2/d2_test.py" - }, "TestBasicMemory": { "difficulty": "basic", "dependencies": [], @@ -16,44 +6,54 @@ }, "TestRememberMultipleIds": { "difficulty": "basic", - "dependencies": [], + "dependencies": [ + "TestBasicMemory" + ], "test": "agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py" }, "TestRememberMultipleIdsWithNoise": { "difficulty": "medium", - "dependencies": [], + "dependencies": [ + "TestRememberMultipleIds" + ], "test": "agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py" }, + "TestRememberMultiplePhrasesWithNoise": { + "difficulty": "medium", + "dependencies": [ + "TestRememberMultipleIdsWithNoise" + ], + "test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py" + }, "TestRetrieval": { "difficulty": "basic", "dependencies": [], "test": "agbenchmark/challenges/retrieval/r1/r1_test.py" }, - "TestWriteFile": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py" - }, "TestRetrieval2": { "difficulty": "basic", - "dependencies": [], + "dependencies": [ + "TestRetrieval" + ], "test": "agbenchmark/challenges/retrieval/r2/r2_test.py" }, - "TestReadFile": { + "TestRetrieval3": { "difficulty": "basic", "dependencies": [ - "basic_write_file" + "TestRetrieval2" ], - "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py" + "test": "agbenchmark/challenges/retrieval/r3/r3_test.py" }, - "TestRetrieval3": { + "TestWriteFile": { "difficulty": "basic", "dependencies": [], - "test": "agbenchmark/challenges/retrieval/r3/r3_test.py" + "test": "agbenchmark/challenges/interface/write_file/write_file_test.py" }, - "TestRememberMultiplePhrasesWithNoise": { - "difficulty": "medium", - "dependencies": [], - "test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py" + "TestReadFile": { + "difficulty": "basic", + "dependencies": [ + "TestWriteFile" + ], + "test": "agbenchmark/challenges/interface/read_file/read_file_test.py" } } \ No newline at end of file -- cgit v1.2.3 From d89264998d36251d8c471942da05b557fa26689d Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Sat, 8 Jul 2023 18:46:37 -0700 Subject: Fix debug code challenge (#76) Co-authored-by: Silen Naihin --- agbenchmark/conftest.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index ffbb26202..7d3dd8ed3 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -40,9 +40,7 @@ def config(request: Any) -> None: with open(CONFIG_PATH, "r") as f: config = json.load(f) - if request.config.getoption("--mock"): - config["workspace"] = "agbenchmark/workspace" - elif isinstance(config["workspace"], str): + if isinstance(config["workspace"], str): config["workspace"] = resolve_workspace(config) else: # it's a input output dict config["workspace"]["input"] = resolve_workspace(config) -- cgit v1.2.3 From 573130549fec6fe86194dec6cd9a2257dc5c5eec Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Sun, 9 Jul 2023 13:31:31 -0700 Subject: Add gpt engineer to ci (#78) --- 
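The conftest.py change in the previous patch also keeps a second branch for the case where "workspace" is an input/output mapping rather than a string. That shape never appears in these patches, but a config.json using it would presumably look something like the following (the paths are illustrative only):

    {
      "workspace": {
        "input": "${os.path.join(Path.home(), 'benchmark_workspace', 'input')}",
        "output": "${os.path.join(Path.home(), 'benchmark_workspace', 'output')}"
      },
      "entry_path": "benchmarks.py",
      "home_path": "agent/mini-agi",
      "cutoff": 60
    }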
.github/workflows/gpt-engineer.yml | 24 +++++++++++++++++++++--- agbenchmark/start_benchmark.py | 3 +++ agent/Auto-GPT | 2 +- agent/gpt-engineer | 2 +- 4 files changed, 26 insertions(+), 5 deletions(-) diff --git a/.github/workflows/gpt-engineer.yml b/.github/workflows/gpt-engineer.yml index a39165482..833026e8e 100644 --- a/.github/workflows/gpt-engineer.yml +++ b/.github/workflows/gpt-engineer.yml @@ -5,6 +5,8 @@ on: branches: [master] push: branches: [stable, master, ci-test*] + pull_request: + branches: [stable, master, ci-test*] jobs: regression-tests: @@ -52,14 +54,30 @@ jobs: poetry install --only main poetry build - - name: Run regression tests + - name: Run regression tests (push) + if: ${{ github.event_name != 'pull_request' }} run: | cd agent/gpt-engineer make install source venv/bin/activate - pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl - agbenchmark start --maintain + pip install ../../dist/*.whl + + if [ "${GITHUB_EVENT_NAME}" != "pull_request" ]; then + agbenchmark start --maintain + else + agbenchmark start --maintain --mock + agbenchmark start --improve --mock + agbenchmark start --mock + agbenchmark start --mock --category=retrieval + agbenchmark start --mock --category=regression + agbenchmark start --mock --category=interface + agbenchmark start --mock --category=code + agbenchmark start --mock --category=memory + agbenchmark start --mock --category=memory --category=code + fi + env: + GITHUB_EVENT_NAME: ${{ github.event_name }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - name: Upload logs as artifact diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index f78e86a1c..68c7932be 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -89,6 +89,9 @@ def start(category: str, maintain: bool, improve: bool, mock: bool) -> int: if mock: pytest_args.append("--mock") + # when used as a library, the pytest directory to execute is in the CURRENT_DIRECTORY + pytest_args.append(str(CURRENT_DIRECTORY)) + return sys.exit(pytest.main(pytest_args)) diff --git a/agent/Auto-GPT b/agent/Auto-GPT index 5a36e43b7..cec424ad2 160000 --- a/agent/Auto-GPT +++ b/agent/Auto-GPT @@ -1 +1 @@ -Subproject commit 5a36e43b782fdaef8a7270109f8347f0323211d2 +Subproject commit cec424ad2504020a830c3af9f74536a420545931 diff --git a/agent/gpt-engineer b/agent/gpt-engineer index fd705f89a..4af8c137e 160000 --- a/agent/gpt-engineer +++ b/agent/gpt-engineer @@ -1 +1 @@ -Subproject commit fd705f89afd53469c91935a9cae7b92a564025eb +Subproject commit 4af8c137e82cc51fdd31c23327ceffd64194b984 -- cgit v1.2.3 From 3d43117554034a634f1c39018c6af6c69ed16fc9 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 9 Jul 2023 20:27:21 -0400 Subject: Just json, no test files (#77) --- agbenchmark/RegressionManager.py | 13 +++- agbenchmark/challenge.py | 26 ++------ agbenchmark/challenges/code/d1/data.json | 1 + .../d1/debug_simple_typo_with_guidance_test.py | 13 ---- agbenchmark/challenges/code/d2/d2_test.py | 14 ---- agbenchmark/challenges/code/d2/data.json | 1 + agbenchmark/challenges/define_task_types.py | 1 + .../challenges/interface/read_file/data.json | 2 +- .../interface/read_file/read_file_test.py | 12 ---- .../challenges/interface/write_file/data.json | 2 +- .../interface/write_file/write_file_test.py | 13 ---- agbenchmark/challenges/memory/m1/data.json | 1 + agbenchmark/challenges/memory/m1/m1_test.py | 13 ---- agbenchmark/challenges/memory/m2/data.json | 1 + .../memory/m2/remember_multiple_ids_test.py | 13 ---- 
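The substance of this patch is that every hand-written *_test.py is deleted and each challenge is defined solely by its data.json, validated through the pydantic models in define_task_types.py and instantiated as a test at collection time. A condensed sketch of the schema those files must now satisfy; the field list mirrors the data.json files in this series, so treat it as inferred rather than copied from the real module:

    from typing import List

    from pydantic import BaseModel


    class Ground(BaseModel):
        answer: str
        should_contain: List[str]
        should_not_contain: List[str]
        files: List[str]
        type: str  # "file" here; "custom_python" appears later in this series


    class Info(BaseModel):
        difficulty: str
        description: str
        side_effects: List[str]


    class ChallengeData(BaseModel):
        name: str  # newly required; doubles as the generated test class name
        category: List[str]
        task: str
        dependencies: List[str]
        ground: Ground
        info: Info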
agbenchmark/challenges/memory/m3/data.json | 1 + .../m3/remember_multiple_ids_with_noise_test.py | 13 ---- agbenchmark/challenges/memory/m4/data.json | 1 + .../remember_multiple_phrases_with_noise_test.py | 13 ---- agbenchmark/challenges/retrieval/r1/data.json | 1 + agbenchmark/challenges/retrieval/r1/r1_test.py | 13 ---- agbenchmark/challenges/retrieval/r2/data.json | 3 +- agbenchmark/challenges/retrieval/r2/r2_test.py | 13 ---- agbenchmark/challenges/retrieval/r3/data.json | 1 + agbenchmark/challenges/retrieval/r3/r3_test.py | 14 ---- agbenchmark/challenges/test_all.py | 78 ++++++++++++++++++++++ agbenchmark/conftest.py | 40 +++++------ regression_tests.json | 61 +++++++++-------- 28 files changed, 158 insertions(+), 220 deletions(-) delete mode 100644 agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py delete mode 100644 agbenchmark/challenges/code/d2/d2_test.py delete mode 100644 agbenchmark/challenges/interface/read_file/read_file_test.py delete mode 100644 agbenchmark/challenges/interface/write_file/write_file_test.py delete mode 100644 agbenchmark/challenges/memory/m1/m1_test.py delete mode 100644 agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py delete mode 100644 agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py delete mode 100644 agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py delete mode 100644 agbenchmark/challenges/retrieval/r1/r1_test.py delete mode 100644 agbenchmark/challenges/retrieval/r2/r2_test.py delete mode 100644 agbenchmark/challenges/retrieval/r3/r3_test.py create mode 100644 agbenchmark/challenges/test_all.py diff --git a/agbenchmark/RegressionManager.py b/agbenchmark/RegressionManager.py index a1379ecae..e289a4787 100644 --- a/agbenchmark/RegressionManager.py +++ b/agbenchmark/RegressionManager.py @@ -11,9 +11,18 @@ class RegressionManager: def load(self) -> None: try: with open(self.filename, "r") as f: - self.tests = json.load(f) - except (FileNotFoundError, json.decoder.JSONDecodeError): + file_content = ( + f.read().strip() + ) # read the content and remove any leading/trailing whitespace + if file_content: # if file is not empty, load the json + self.tests = json.loads(file_content) + else: # if file is empty, assign an empty dictionary + self.tests = {} + except FileNotFoundError: self.tests = {} + except json.decoder.JSONDecodeError: # If JSON is invalid + self.tests = {} + self.save() def save(self) -> None: with open(self.filename, "w") as f: diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py index ddf69f42d..cf7ce104c 100644 --- a/agbenchmark/challenge.py +++ b/agbenchmark/challenge.py @@ -1,10 +1,8 @@ import glob -import inspect import os import subprocess -import types -from abc import ABC, ABCMeta -from typing import Any, Dict, List, Tuple, Type, cast +from abc import ABC +from typing import Any, Dict, List from dotenv import load_dotenv @@ -16,24 +14,12 @@ mock_test_str = os.getenv("MOCK_TEST") MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str else False -class ChallengeMeta(ABCMeta): - def __init__(self, name: str, bases: Tuple[Type, ...], dct: Dict[str, Any]) -> None: - super().__init__(name, bases, dct) - try: - frame = cast(types.FrameType, inspect.currentframe()) - assert frame.f_back is not None - self.CHALLENGE_LOCATION = os.path.dirname(inspect.getfile(frame.f_back)) - except Exception as e: - print(f"Unable to get the file from 8 frames back due to: {str(e)}") - raise e - - -class Challenge(ABC, metaclass=ChallengeMeta): +class 
Challenge(ABC): """The parent class to all specific challenges classes. Defines helper methods for running a challenge""" _data_cache: Dict[str, ChallengeData] = {} - CHALLENGE_LOCATION: str + CHALLENGE_LOCATION: str = "" @property def data(self) -> ChallengeData: @@ -54,10 +40,10 @@ class Challenge(ABC, metaclass=ChallengeMeta): from agbenchmark.agent_interface import copy_artifacts_into_workspace, run_agent copy_artifacts_into_workspace( - config["workspace"], "artifacts_in", self.__class__.CHALLENGE_LOCATION + config["workspace"], "artifacts_in", self.CHALLENGE_LOCATION ) - run_agent(self.task, config, self.__class__.CHALLENGE_LOCATION) + run_agent(self.task, config, self.CHALLENGE_LOCATION) def test_method(self, config: Dict[str, Any]) -> None: raise NotImplementedError diff --git a/agbenchmark/challenges/code/d1/data.json b/agbenchmark/challenges/code/d1/data.json index 6ac284b81..0c7246000 100644 --- a/agbenchmark/challenges/code/d1/data.json +++ b/agbenchmark/challenges/code/d1/data.json @@ -1,4 +1,5 @@ { + "name": "TestDebugSimpleTypoWithGuidance", "category": ["code"], "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", "dependencies": ["TestReadFile", "TestWriteFile"], diff --git a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py deleted file mode 100644 index d104b3374..000000000 --- a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py +++ /dev/null @@ -1,13 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge import Challenge - - -class TestDebugSimpleTypoWithGuidance(Challenge): - """The first memory challenge""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - - scores = self.get_scores(config) - assert 1 in scores diff --git a/agbenchmark/challenges/code/d2/d2_test.py b/agbenchmark/challenges/code/d2/d2_test.py deleted file mode 100644 index b02114a75..000000000 --- a/agbenchmark/challenges/code/d2/d2_test.py +++ /dev/null @@ -1,14 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge import Challenge - - -class TestDebugSimpleTypoWithoutGuidance(Challenge): - """The first memory challenge""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - - scores = self.get_scores(config) - - assert 1 in scores diff --git a/agbenchmark/challenges/code/d2/data.json b/agbenchmark/challenges/code/d2/data.json index 3de5111f5..292301094 100644 --- a/agbenchmark/challenges/code/d2/data.json +++ b/agbenchmark/challenges/code/d2/data.json @@ -1,4 +1,5 @@ { + "name": "TestDebugSimpleTypoWithoutGuidance", "category": ["code"], "task": "Make test.py run without errors.", "dependencies": ["TestDebugSimpleTypoWithGuidance"], diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index 308cb5ea6..94cba5b72 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -19,6 +19,7 @@ class Ground(BaseModel): class ChallengeData(BaseModel): + name: str category: List[str] task: str dependencies: List[str] diff --git a/agbenchmark/challenges/interface/read_file/data.json b/agbenchmark/challenges/interface/read_file/data.json index dd399fabf..c827581b6 100644 --- a/agbenchmark/challenges/interface/read_file/data.json +++ b/agbenchmark/challenges/interface/read_file/data.json @@ -1,5 +1,5 @@ { - "name": "ReadFile", 
+ "name": "TestReadFile", "category": ["interface"], "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", "dependencies": ["TestWriteFile"], diff --git a/agbenchmark/challenges/interface/read_file/read_file_test.py b/agbenchmark/challenges/interface/read_file/read_file_test.py deleted file mode 100644 index 591d0a744..000000000 --- a/agbenchmark/challenges/interface/read_file/read_file_test.py +++ /dev/null @@ -1,12 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge import Challenge - - -class TestReadFile(Challenge): - """Testing if LLM can read a file""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - scores = self.get_scores(config) - assert 1 in scores diff --git a/agbenchmark/challenges/interface/write_file/data.json b/agbenchmark/challenges/interface/write_file/data.json index b3e4b6f02..2be2d0dfe 100644 --- a/agbenchmark/challenges/interface/write_file/data.json +++ b/agbenchmark/challenges/interface/write_file/data.json @@ -1,5 +1,5 @@ { - "name": "WriteFile", + "name": "TestWriteFile", "category": ["interface"], "task": "Print the the capital of America to a .txt file", "dependencies": [], diff --git a/agbenchmark/challenges/interface/write_file/write_file_test.py b/agbenchmark/challenges/interface/write_file/write_file_test.py deleted file mode 100644 index 4a52b0979..000000000 --- a/agbenchmark/challenges/interface/write_file/write_file_test.py +++ /dev/null @@ -1,13 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge import Challenge - - -class TestWriteFile(Challenge): - """Testing if LLM can write to a file""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - - scores = self.get_scores(config) - assert 1 in scores diff --git a/agbenchmark/challenges/memory/m1/data.json b/agbenchmark/challenges/memory/m1/data.json index f771a2669..506b246ad 100644 --- a/agbenchmark/challenges/memory/m1/data.json +++ b/agbenchmark/challenges/memory/m1/data.json @@ -1,4 +1,5 @@ { + "name": "TestBasicMemory", "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", "dependencies": ["TestReadFile", "TestWriteFile"], diff --git a/agbenchmark/challenges/memory/m1/m1_test.py b/agbenchmark/challenges/memory/m1/m1_test.py deleted file mode 100644 index 0fc537eeb..000000000 --- a/agbenchmark/challenges/memory/m1/m1_test.py +++ /dev/null @@ -1,13 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge import Challenge - - -class TestBasicMemory(Challenge): - """The first memory challenge""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - - scores = self.get_scores(config) - assert 1 in scores diff --git a/agbenchmark/challenges/memory/m2/data.json b/agbenchmark/challenges/memory/m2/data.json index 998e894b1..7ef2552d1 100644 --- a/agbenchmark/challenges/memory/m2/data.json +++ b/agbenchmark/challenges/memory/m2/data.json @@ -1,4 +1,5 @@ { + "name": "TestRememberMultipleIds", "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", "dependencies": ["TestBasicMemory"], diff --git a/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py b/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py deleted file mode 100644 index c88f28831..000000000 --- a/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py +++ /dev/null @@ -1,13 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge 
import Challenge - - -class TestRememberMultipleIds(Challenge): - """The first memory challenge""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - - scores = self.get_scores(config) - assert 1 in scores diff --git a/agbenchmark/challenges/memory/m3/data.json b/agbenchmark/challenges/memory/m3/data.json index d5d95b1de..720cce93c 100644 --- a/agbenchmark/challenges/memory/m3/data.json +++ b/agbenchmark/challenges/memory/m3/data.json @@ -1,4 +1,5 @@ { + "name": "TestRememberMultipleIdsWithNoise", "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", "dependencies": ["TestRememberMultipleIds"], diff --git a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py deleted file mode 100644 index 0e35dd2f4..000000000 --- a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py +++ /dev/null @@ -1,13 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge import Challenge - - -class TestRememberMultipleIdsWithNoise(Challenge): - """The first memory challenge""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - - scores = self.get_scores(config) - assert 1 in scores diff --git a/agbenchmark/challenges/memory/m4/data.json b/agbenchmark/challenges/memory/m4/data.json index 49831537e..61965206b 100644 --- a/agbenchmark/challenges/memory/m4/data.json +++ b/agbenchmark/challenges/memory/m4/data.json @@ -1,4 +1,5 @@ { + "name": "TestRememberMultiplePhrasesWithNoise", "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", "dependencies": ["TestRememberMultipleIdsWithNoise"], diff --git a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py deleted file mode 100644 index 4c4bdce55..000000000 --- a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py +++ /dev/null @@ -1,13 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge import Challenge - - -class TestRememberMultiplePhrasesWithNoise(Challenge): - """The first memory challenge""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - - scores = self.get_scores(config) - assert 1 in scores diff --git a/agbenchmark/challenges/retrieval/r1/data.json b/agbenchmark/challenges/retrieval/r1/data.json index 6e1344b8b..7812c21da 100644 --- a/agbenchmark/challenges/retrieval/r1/data.json +++ b/agbenchmark/challenges/retrieval/r1/data.json @@ -1,4 +1,5 @@ { + "name": "TestBasicRetrieval", "category": ["retrieval"], "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", "dependencies": ["TestWriteFile"], diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py deleted file mode 100644 index 9845a7b2a..000000000 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ /dev/null @@ -1,13 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge import Challenge - - -class TestRetrieval(Challenge): - """The first information-retrieval challenge""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - - scores = self.get_scores(config) - assert 1 in scores diff --git a/agbenchmark/challenges/retrieval/r2/data.json 
b/agbenchmark/challenges/retrieval/r2/data.json index 05846b9f3..5bc2e96b4 100644 --- a/agbenchmark/challenges/retrieval/r2/data.json +++ b/agbenchmark/challenges/retrieval/r2/data.json @@ -1,7 +1,8 @@ { + "name": "TestRetrieval2", "category": ["retrieval"], "task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", - "dependencies": ["TestRetrieval"], + "dependencies": ["TestBasicRetrieval"], "ground": { "answer": "81,462", "should_contain": ["81,462"], diff --git a/agbenchmark/challenges/retrieval/r2/r2_test.py b/agbenchmark/challenges/retrieval/r2/r2_test.py deleted file mode 100644 index f0f13ffbf..000000000 --- a/agbenchmark/challenges/retrieval/r2/r2_test.py +++ /dev/null @@ -1,13 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge import Challenge - - -class TestRetrieval2(Challenge): - """The first information-retrieval challenge""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - - scores = self.get_scores(config) - assert 1 in scores diff --git a/agbenchmark/challenges/retrieval/r3/data.json b/agbenchmark/challenges/retrieval/r3/data.json index 763c963ec..b918d3d4e 100644 --- a/agbenchmark/challenges/retrieval/r3/data.json +++ b/agbenchmark/challenges/retrieval/r3/data.json @@ -1,4 +1,5 @@ { + "name": "TestRetrieval3", "category": ["retrieval"], "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", "dependencies": ["TestRetrieval2"], diff --git a/agbenchmark/challenges/retrieval/r3/r3_test.py b/agbenchmark/challenges/retrieval/r3/r3_test.py deleted file mode 100644 index 5887c0b43..000000000 --- a/agbenchmark/challenges/retrieval/r3/r3_test.py +++ /dev/null @@ -1,14 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge import Challenge - - -class TestRetrieval3(Challenge): - """The first information-retrieval challenge""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - - scores = self.get_scores(config) - - assert 1 in scores diff --git a/agbenchmark/challenges/test_all.py b/agbenchmark/challenges/test_all.py new file mode 100644 index 000000000..4f9e5b7f8 --- /dev/null +++ b/agbenchmark/challenges/test_all.py @@ -0,0 +1,78 @@ +import glob +import importlib +import json +import os +import types +from pathlib import Path +from typing import Any, Dict + +import pytest +from dotenv import load_dotenv + +from agbenchmark.challenge import Challenge + +load_dotenv() + +IMPROVE = os.getenv("IMPROVE", "False") + + +json_files = glob.glob("agbenchmark/challenges/**/data.json", recursive=True) + + +def get_test_path(json_file: str) -> str: + abs_location = os.path.dirname(os.path.abspath(json_file)) + + path = Path(abs_location) + + # Find the index of "agbenchmark" in the path parts + try: + agbenchmark_index = path.parts.index("agbenchmark") + except ValueError: + raise ValueError("Invalid challenge location.") + + # Create the path from "agbenchmark" onwards + challenge_location = Path(*path.parts[agbenchmark_index:]) + + return str(challenge_location) + + +def generate_tests() -> None: + print("Generating tests...") + # Dynamic class creation + for json_file in json_files: + with open(json_file, "r") as f: + data = json.load(f) + + class_name = data.get("name", "") + + challenge_location = get_test_path(json_file) + + # Define 
test class dynamically + challenge_class = types.new_class(class_name, (Challenge,)) + + setattr(challenge_class, "CHALLENGE_LOCATION", challenge_location) + + # Define test method within the dynamically created class + def test_method(self, config: Dict[str, Any]) -> None: # type: ignore + self.setup_challenge(config) + + scores = self.get_scores(config) + assert 1 in scores + + # Parametrize the method here + test_method = pytest.mark.parametrize( + "challenge_data", + [data], + indirect=True, + )(test_method) + + setattr(challenge_class, "test_method", test_method) + + # Attach the new class to a module so it can be discovered by pytest + module = importlib.import_module(__name__) + setattr(module, class_name, challenge_class) + + print(f"Generated test for {class_name}.") + + +generate_tests() diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 7d3dd8ed3..e321f5a26 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -88,13 +88,16 @@ def check_regression(request: Any) -> None: test_name = request.node.parent.name data = get_regression_data() + # Get the true location of the test + challenge_location = getattr(request.node.parent.cls, "CHALLENGE_LOCATION", "") + + skip_string = f"Skipping {test_name} at {challenge_location}" + # Check if the test name exists in the regression tests if request.config.getoption("--improve") and data.get(test_name, None): - pytest.skip("Skipping test because it's a regression test and --improve is set") + pytest.skip(f"{skip_string} because it's a regression test") elif request.config.getoption("--maintain") and not data.get(test_name, None): - pytest.skip( - "Skipping test because it's not a regression test and --maintain is set" - ) + pytest.skip(f"{skip_string} because it's not a regression test") # this is to get the challenge_data from every test @@ -109,15 +112,19 @@ regression_manager = RegressionManager(REGRESSION_TESTS_PATH) def pytest_runtest_makereport(item: Any, call: Any) -> None: if call.when == "call": challenge_data = item.funcargs.get("challenge_data", None) - difficulty = challenge_data.info.difficulty if challenge_data else "unknown" - dependencies = challenge_data.dependencies if challenge_data else [] - parts = item.nodeid.split("::")[0].split("/") - agbenchmark_index = parts.index("agbenchmark") - file_path = "/".join(parts[agbenchmark_index:]) + difficulty = ( + challenge_data["info"]["difficulty"] if challenge_data else "unknown" + ) + dependencies = dependencies = ( + challenge_data["dependencies"] if challenge_data else [] + ) + # Extract the challenge_location from the class + challenge_location: str = getattr(item.cls, "CHALLENGE_LOCATION", "") + test_details = { "difficulty": difficulty, "dependencies": dependencies, - "test": file_path, + "test": challenge_location, } print("pytest_runtest_makereport", test_details) @@ -132,19 +139,6 @@ def pytest_sessionfinish() -> None: regression_manager.save() -# this is so that all tests can inherit from the Challenge class -def pytest_generate_tests(metafunc: Any) -> None: - if "challenge_data" in metafunc.fixturenames: - # Get the instance of the test class - test_class = metafunc.cls() - - # Generate the parameters - params = test_class.data - - # Add the parameters to the test function - metafunc.parametrize("challenge_data", [params], indirect=True) - - # this is adding the dependency marker and category markers automatically from the json def pytest_collection_modifyitems(items: Any, config: Any) -> None: data = get_regression_data() diff --git 
a/regression_tests.json b/regression_tests.json index 44334801e..613207917 100644 --- a/regression_tests.json +++ b/regression_tests.json @@ -1,59 +1,64 @@ { - "TestBasicMemory": { + "TestWriteFile": { "difficulty": "basic", "dependencies": [], - "test": "agbenchmark/challenges/memory/m1/m1_test.py" + "test": "agbenchmark\\challenges\\interface\\write_file" }, - "TestRememberMultipleIds": { + "TestReadFile": { "difficulty": "basic", "dependencies": [ - "TestBasicMemory" + "TestWriteFile" ], - "test": "agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py" + "test": "agbenchmark\\challenges\\interface\\read_file" }, - "TestRememberMultipleIdsWithNoise": { - "difficulty": "medium", + "TestBasicMemory": { + "difficulty": "basic", "dependencies": [ - "TestRememberMultipleIds" + "TestReadFile", + "TestWriteFile" ], - "test": "agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py" + "test": "agbenchmark\\challenges\\memory\\m1" }, - "TestRememberMultiplePhrasesWithNoise": { - "difficulty": "medium", + "TestBasicRetrieval": { + "difficulty": "basic", "dependencies": [ - "TestRememberMultipleIdsWithNoise" + "TestWriteFile" ], - "test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py" + "test": "agbenchmark\\challenges\\retrieval\\r1" }, - "TestRetrieval": { + "TestRememberMultipleIds": { "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/retrieval/r1/r1_test.py" + "dependencies": [ + "TestBasicMemory" + ], + "test": "agbenchmark\\challenges\\memory\\m2" }, "TestRetrieval2": { "difficulty": "basic", "dependencies": [ - "TestRetrieval" + "TestBasicRetrieval" + ], + "test": "agbenchmark\\challenges\\retrieval\\r2" + }, + "TestRememberMultipleIdsWithNoise": { + "difficulty": "medium", + "dependencies": [ + "TestRememberMultipleIds" ], - "test": "agbenchmark/challenges/retrieval/r2/r2_test.py" + "test": "agbenchmark\\challenges\\memory\\m3" }, "TestRetrieval3": { "difficulty": "basic", "dependencies": [ "TestRetrieval2" ], - "test": "agbenchmark/challenges/retrieval/r3/r3_test.py" + "test": "agbenchmark\\challenges\\retrieval\\r3" }, - "TestWriteFile": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/interface/write_file/write_file_test.py" - }, - "TestReadFile": { - "difficulty": "basic", + "TestRememberMultiplePhrasesWithNoise": { + "difficulty": "medium", "dependencies": [ - "TestWriteFile" + "TestRememberMultipleIdsWithNoise" ], - "test": "agbenchmark/challenges/interface/read_file/read_file_test.py" + "test": "agbenchmark\\challenges\\memory\\m4" } } \ No newline at end of file -- cgit v1.2.3 From 0fa5286ad0e06fc5089b7002a930f752227c2061 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Sun, 9 Jul 2023 18:06:26 -0700 Subject: Combine all agents into one ci.yml (#79) Signed-off-by: Merwane Hamadi --- .github/workflows/autogpt.yml | 64 -------------------------- .github/workflows/ci.yml | 76 ++++++++++++++++++++++++++++++- .github/workflows/gpt-engineer.yml | 88 ------------------------------------ .github/workflows/mini-agi.yml | 66 --------------------------- .github/workflows/smol-developer.yml | 64 -------------------------- .github/workflows/superagi.yml | 62 ------------------------- 6 files changed, 74 insertions(+), 346 deletions(-) delete mode 100644 .github/workflows/autogpt.yml delete mode 100644 .github/workflows/gpt-engineer.yml delete mode 100644 .github/workflows/mini-agi.yml delete mode 100644 .github/workflows/smol-developer.yml delete mode 100644 
.github/workflows/superagi.yml diff --git a/.github/workflows/autogpt.yml b/.github/workflows/autogpt.yml deleted file mode 100644 index 2d7e2dfbd..000000000 --- a/.github/workflows/autogpt.yml +++ /dev/null @@ -1,64 +0,0 @@ -name: Auto-GPT Regression Test - -on: - workflow_dispatch: - branches: [master] - push: - branches: [stable, master, ci-test*] - -jobs: - regression-tests: - permissions: - pull-requests: write - contents: write - runs-on: ubuntu-latest - timeout-minutes: 30 - strategy: - matrix: - python-version: ['3.10'] - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 0 - ref: ${{ github.event.pull_request.head.ref }} - repository: ${{ github.event.pull_request.head.repo.full_name }} - submodules: true - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - id: get_date - name: Get date - run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT - - - name: Install Poetry - run: | - curl -sSL https://install.python-poetry.org | python - - - - name: Set up Poetry cache - uses: actions/cache@v2 - with: - path: | - ~/.cache/pypoetry - .venv - key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} - - - name: Set up venv and install Python dependencies - run: | - poetry install --only main - poetry build - - - name: Run regression tests - run: | - python -m venv venv - source venv/bin/activate - cd agent/Auto-GPT - pip install -r requirements.txt - pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl - agbenchmark start --maintain - env: - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6a0f4503a..d989389db 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,6 +1,10 @@ -name: Python CI +name: CI on: + workflow_dispatch: + branches: [ master ] + schedule: + - cron: '0 8 * * *' push: branches: [ master, ci-test* ] pull_request: @@ -20,6 +24,7 @@ jobs: fetch-depth: 0 ref: ${{ github.event.pull_request.head.ref }} repository: ${{ github.event.pull_request.head.repo.full_name }} + submodules: true - name: Set up Python ${{ env.min-python-version }} uses: actions/setup-python@v2 @@ -68,10 +73,18 @@ jobs: if: success() || failure() tests: - + name: ${{ matrix.agent-name }} runs-on: ubuntu-latest env: min-python-version: "3.10" + strategy: + fail-fast: false + matrix: + agent-name: + - "gpt-engineer" + - "Auto-GPT" + - "mini-agi" + - "smol-developer" steps: - name: Checkout repository @@ -80,6 +93,7 @@ jobs: fetch-depth: 0 ref: ${{ github.event.pull_request.head.ref }} repository: ${{ github.event.pull_request.head.repo.full_name }} + submodules: true - name: Set up Python ${{ env.min-python-version }} uses: actions/setup-python@v2 @@ -107,3 +121,61 @@ jobs: poetry install poetry run agbenchmark start --mock poetry run agbenchmark start --mock --maintain + poetry build + + - name: Run regression tests + run: | + cd agent/$AGENT_NAME + if [ "$AGENT_NAME" == "gpt-engineer" ]; then + make install + source venv/bin/activate + elif [ "$AGENT_NAME" == "Auto-GPT" ]; then + python -m venv venv + source venv/bin/activate + pip install -r requirements.txt + elif [ "$AGENT_NAME" == "mini-agi" ]; then + python -m venv venv + source venv/bin/activate + pip install -r requirements.txt + cp .env_example .env + elif [ "$AGENT_NAME" == "smol-developer" ]; then + python -m venv venv + source 
venv/bin/activate + pip install -r requirements.txt + elif [ "$AGENT_NAME" == "SuperAGI" ]; then + cp config_template.yaml config.yaml + sed -i 's/OPENAI_API_KEY:.*/OPENAI_API_KEY: "'"${{ secrets.OPENAI_API_KEY }}"'"/' config.yaml + docker-compose up -d --build + else + echo "Unknown agent name: $AGENT_NAME" + exit 1 + fi + + pip install ../../dist/*.whl + + if [ "${GITHUB_EVENT_NAME}" == "schedule" ] || [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ]; then + agbenchmark start --maintain + else + exit 0 + agbenchmark start --maintain --mock + agbenchmark start --improve --mock + agbenchmark start --mock + agbenchmark start --mock --category=retrieval + agbenchmark start --mock --category=regression + agbenchmark start --mock --category=interface + agbenchmark start --mock --category=code + agbenchmark start --mock --category=memory + agbenchmark start --mock --category=memory --category=code + fi + env: + GITHUB_EVENT_NAME: ${{ github.event_name }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + AGENT_NAME: ${{ matrix.agent-name }} + PROMPT_USER: false # For mini-agi. TODO: Remove this once mini-agi follows the standards. + + - name: Upload logs as artifact + if: always() + uses: actions/upload-artifact@v3 + with: + name: gpt-engineer-projects + path: agent/gpt-engineer/projects diff --git a/.github/workflows/gpt-engineer.yml b/.github/workflows/gpt-engineer.yml deleted file mode 100644 index 833026e8e..000000000 --- a/.github/workflows/gpt-engineer.yml +++ /dev/null @@ -1,88 +0,0 @@ -name: gpt-engineer Regression Test - -on: - workflow_dispatch: - branches: [master] - push: - branches: [stable, master, ci-test*] - pull_request: - branches: [stable, master, ci-test*] - -jobs: - regression-tests: - permissions: - pull-requests: write - contents: write - runs-on: ubuntu-latest - timeout-minutes: 30 - strategy: - matrix: - python-version: ['3.10'] - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 0 - ref: ${{ github.event.pull_request.head.ref }} - repository: ${{ github.event.pull_request.head.repo.full_name }} - submodules: true - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - id: get_date - name: Get date - run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT - - - name: Install Poetry - run: | - curl -sSL https://install.python-poetry.org | python - - - - name: Set up Poetry cache - uses: actions/cache@v2 - with: - path: | - ~/.cache/pypoetry - .venv - key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} - - - name: Set up venv and install Python dependencies - run: | - poetry install --only main - poetry build - - - name: Run regression tests (push) - if: ${{ github.event_name != 'pull_request' }} - run: | - cd agent/gpt-engineer - make install - source venv/bin/activate - pip install ../../dist/*.whl - - if [ "${GITHUB_EVENT_NAME}" != "pull_request" ]; then - agbenchmark start --maintain - else - agbenchmark start --maintain --mock - agbenchmark start --improve --mock - agbenchmark start --mock - agbenchmark start --mock --category=retrieval - agbenchmark start --mock --category=regression - agbenchmark start --mock --category=interface - agbenchmark start --mock --category=code - agbenchmark start --mock --category=memory - agbenchmark start --mock --category=memory --category=code - fi - - env: - GITHUB_EVENT_NAME: ${{ github.event_name }} - OPENAI_API_KEY: 
${{ secrets.OPENAI_API_KEY }} - - - name: Upload logs as artifact - if: always() - uses: actions/upload-artifact@v3 - with: - name: gpt-engineer-projects - path: agent/gpt-engineer/projects diff --git a/.github/workflows/mini-agi.yml b/.github/workflows/mini-agi.yml deleted file mode 100644 index 53c479df4..000000000 --- a/.github/workflows/mini-agi.yml +++ /dev/null @@ -1,66 +0,0 @@ -name: mini-agi Regression Test - -on: - workflow_dispatch: - branches: [master] - push: - branches: [stable, master, ci-test*] - -jobs: - regression-tests: - permissions: - pull-requests: write - contents: write - runs-on: ubuntu-latest - timeout-minutes: 30 - strategy: - matrix: - python-version: ['3.10'] - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 0 - ref: ${{ github.event.pull_request.head.ref }} - repository: ${{ github.event.pull_request.head.repo.full_name }} - submodules: true - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - id: get_date - name: Get date - run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT - - - name: Install Poetry - run: | - curl -sSL https://install.python-poetry.org | python - - - - name: Set up Poetry cache - uses: actions/cache@v2 - with: - path: | - ~/.cache/pypoetry - .venv - key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} - - - name: Set up venv and install Python dependencies - run: | - poetry install --only main - poetry build - - - name: Run regression tests - run: | - cd agent/mini-agi - python -m venv venv - source venv/bin/activate - pip install -r requirements.txt - cp .env_example .env - pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl - agbenchmark start --maintain - env: - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - PROMPT_USER: false diff --git a/.github/workflows/smol-developer.yml b/.github/workflows/smol-developer.yml deleted file mode 100644 index 6926df54b..000000000 --- a/.github/workflows/smol-developer.yml +++ /dev/null @@ -1,64 +0,0 @@ -name: smol developer Regression Test - -on: - workflow_dispatch: - branches: [master] - push: - branches: [stable, master, ci-test*] - -jobs: - regression-tests: - permissions: - pull-requests: write - contents: write - runs-on: ubuntu-latest - timeout-minutes: 30 - strategy: - matrix: - python-version: ['3.10'] - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 0 - ref: ${{ github.event.pull_request.head.ref }} - repository: ${{ github.event.pull_request.head.repo.full_name }} - submodules: true - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - id: get_date - name: Get date - run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT - - - name: Install Poetry - run: | - curl -sSL https://install.python-poetry.org | python - - - - name: Set up Poetry cache - uses: actions/cache@v2 - with: - path: | - ~/.cache/pypoetry - .venv - key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} - - - name: Set up venv and install Python dependencies - run: | - poetry install --only main - poetry build - - - name: Run regression tests - run: | - cd agent/smol-developer - python -m venv venv - source venv/bin/activate - pip install -r requirements.txt - pip install 
../../dist/agbenchmark-0.1.0-py3-none-any.whl - agbenchmark start --maintain - env: - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} diff --git a/.github/workflows/superagi.yml b/.github/workflows/superagi.yml deleted file mode 100644 index 5ab52d33f..000000000 --- a/.github/workflows/superagi.yml +++ /dev/null @@ -1,62 +0,0 @@ -name: SuperAgi Regression Test - -on: - workflow_dispatch: - branches: [master] - push: - branches: [stable, master, ci-test*] - -jobs: - regression-tests: - permissions: - pull-requests: write - contents: write - runs-on: ubuntu-latest - timeout-minutes: 30 - strategy: - matrix: - python-version: ['3.10'] - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 0 - ref: ${{ github.event.pull_request.head.ref }} - repository: ${{ github.event.pull_request.head.repo.full_name }} - submodules: true - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - id: get_date - name: Get date - run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT - - - name: Install Poetry - run: | - curl -sSL https://install.python-poetry.org | python - - - - name: Set up Poetry cache - uses: actions/cache@v2 - with: - path: | - ~/.cache/pypoetry - .venv - key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} - - - name: Set up venv and install Python dependencies - run: | - poetry install --only main - poetry build - - - name: Run regression tests - run: | - cd agent/SuperAGI - cp config_template.yaml config.yaml - sed -i 's/OPENAI_API_KEY:.*/OPENAI_API_KEY: "'"${{ secrets.OPENAI_API_KEY }}"'"/' config.yaml - docker-compose up -d --build - pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl - agbenchmark start --maintain -- cgit v1.2.3 From b8830f86256ce54c990fc4bd4a0fe2ac7389cdbd Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 9 Jul 2023 21:33:08 -0400 Subject: Adding search interface challenge and cleaning repo (#80) --- agbenchmark/challenges/adaptability/a1_test.py | 0 agbenchmark/challenges/interface/browse_test.py | 0 .../interface/search/artifacts_out/random_file.txt | 2 ++ agbenchmark/challenges/interface/search/data.json | 18 ++++++++++++++++++ agbenchmark/challenges/retrieval/r1/data.json | 2 +- agbenchmark/challenges/web_navigation/wn1_test.py | 0 agbenchmark/challenges/writing/w1_test.py | 0 regression_tests.json | 5 +++++ 8 files changed, 26 insertions(+), 1 deletion(-) delete mode 100644 agbenchmark/challenges/adaptability/a1_test.py delete mode 100644 agbenchmark/challenges/interface/browse_test.py create mode 100644 agbenchmark/challenges/interface/search/artifacts_out/random_file.txt create mode 100644 agbenchmark/challenges/interface/search/data.json delete mode 100644 agbenchmark/challenges/web_navigation/wn1_test.py delete mode 100644 agbenchmark/challenges/writing/w1_test.py diff --git a/agbenchmark/challenges/adaptability/a1_test.py b/agbenchmark/challenges/adaptability/a1_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/challenges/interface/browse_test.py b/agbenchmark/challenges/interface/browse_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/challenges/interface/search/artifacts_out/random_file.txt b/agbenchmark/challenges/interface/search/artifacts_out/random_file.txt new file mode 100644 index 000000000..035667591 --- /dev/null +++ 
b/agbenchmark/challenges/interface/search/artifacts_out/random_file.txt @@ -0,0 +1,2 @@ +This is a Heading +This is a paragraph. \ No newline at end of file diff --git a/agbenchmark/challenges/interface/search/data.json b/agbenchmark/challenges/interface/search/data.json new file mode 100644 index 000000000..17ee1ac1a --- /dev/null +++ b/agbenchmark/challenges/interface/search/data.json @@ -0,0 +1,18 @@ +{ + "name": "TestSearch", + "category": ["interface"], + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "dependencies": [], + "ground": { + "answer": "This is a Heading\nThis is a paragraph.", + "should_contain": ["Heading", "paragraph"], + "should_not_contain": ["The", "the"], + "files": [".txt"], + "type": "file" + }, + "info": { + "difficulty": "basic", + "description": "Tests if an llm can search", + "side_effects": [""] + } +} diff --git a/agbenchmark/challenges/retrieval/r1/data.json b/agbenchmark/challenges/retrieval/r1/data.json index 7812c21da..4f3833dfc 100644 --- a/agbenchmark/challenges/retrieval/r1/data.json +++ b/agbenchmark/challenges/retrieval/r1/data.json @@ -2,7 +2,7 @@ "name": "TestBasicRetrieval", "category": ["retrieval"], "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", - "dependencies": ["TestWriteFile"], + "dependencies": ["TestWriteFile", "TestSearch"], "ground": { "answer": "£25.89", "should_contain": ["25.89"], diff --git a/agbenchmark/challenges/web_navigation/wn1_test.py b/agbenchmark/challenges/web_navigation/wn1_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/challenges/writing/w1_test.py b/agbenchmark/challenges/writing/w1_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/regression_tests.json b/regression_tests.json index 613207917..10a6e11bf 100644 --- a/regression_tests.json +++ b/regression_tests.json @@ -60,5 +60,10 @@ "TestRememberMultipleIdsWithNoise" ], "test": "agbenchmark\\challenges\\memory\\m4" + }, + "TestSearch": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark\\challenges\\interface\\search" } } \ No newline at end of file -- cgit v1.2.3 From 9adcad8b8aefd20ae62d0826f5c17394b352d09c Mon Sep 17 00:00:00 2001 From: James Collins Date: Sun, 9 Jul 2023 19:32:04 -0700 Subject: Fix regression: restore api_base and organization configurability (#4933) --- autogpt/config/config.py | 13 ++++++++++++- autogpt/llm/utils/__init__.py | 18 ++++-------------- autogpt/memory/vector/utils.py | 7 ++----- tests/unit/test_config.py | 26 ++++++++++++++++++++------ 4 files changed, 38 insertions(+), 26 deletions(-) diff --git a/autogpt/config/config.py b/autogpt/config/config.py index 05590eb6a..b1ff0a0ab 100644 --- a/autogpt/config/config.py +++ b/autogpt/config/config.py @@ -86,7 +86,18 @@ class Config(SystemSettings): plugins: list[str] authorise_key: str - def get_azure_kwargs(self, model: str) -> dict[str, str]: + def get_openai_credentials(self, model: str) -> dict[str, str]: + credentials = { + "api_key": self.openai_api_key, + "api_base": self.openai_api_base, + "organization": self.openai_organization, + } + if self.use_azure: + azure_credentials = self.get_azure_credentials(model) + credentials.update(azure_credentials) + return credentials + + def get_azure_credentials(self, model: str) -> dict[str, str]: """Get the kwargs for the Azure API.""" # Fix --gpt3only and --gpt4only in combination with Azure diff --git 
a/autogpt/llm/utils/__init__.py b/autogpt/llm/utils/__init__.py index 3c2835b7c..e0ff1473f 100644 --- a/autogpt/llm/utils/__init__.py +++ b/autogpt/llm/utils/__init__.py @@ -71,17 +71,14 @@ def create_text_completion( if temperature is None: temperature = config.temperature - if config.use_azure: - kwargs = config.get_azure_kwargs(model) - else: - kwargs = {"model": model} + kwargs = {"model": model} + kwargs.update(config.get_openai_credentials(model)) response = iopenai.create_text_completion( prompt=prompt, **kwargs, temperature=temperature, max_tokens=max_output_tokens, - api_key=config.openai_api_key, ) logger.debug(f"Response: {response}") @@ -137,9 +134,7 @@ def create_chat_completion( if message is not None: return message - chat_completion_kwargs["api_key"] = config.openai_api_key - if config.use_azure: - chat_completion_kwargs.update(config.get_azure_kwargs(model)) + chat_completion_kwargs.update(config.get_openai_credentials(model)) if functions: chat_completion_kwargs["functions"] = [ @@ -179,12 +174,7 @@ def check_model( config: Config, ) -> str: """Check if model is available for use. If not, return gpt-3.5-turbo.""" - openai_credentials = { - "api_key": config.openai_api_key, - } - if config.use_azure: - openai_credentials.update(config.get_azure_kwargs(model_name)) - + openai_credentials = config.get_openai_credentials(model_name) api_manager = ApiManager() models = api_manager.get_models(**openai_credentials) diff --git a/autogpt/memory/vector/utils.py b/autogpt/memory/vector/utils.py index 74438f28c..eb6912566 100644 --- a/autogpt/memory/vector/utils.py +++ b/autogpt/memory/vector/utils.py @@ -41,10 +41,8 @@ def get_embedding( input = [text.replace("\n", " ") for text in input] model = config.embedding_model - if config.use_azure: - kwargs = config.get_azure_kwargs(model) - else: - kwargs = {"model": model} + kwargs = {"model": model} + kwargs.update(config.get_openai_credentials(model)) logger.debug( f"Getting embedding{f's for {len(input)} inputs' if multiple else ''}" @@ -57,7 +55,6 @@ def get_embedding( embeddings = iopenai.create_embedding( input, **kwargs, - api_key=config.openai_api_key, ).data if not multiple: diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py index b441aa948..7abbfcd52 100644 --- a/tests/unit/test_config.py +++ b/tests/unit/test_config.py @@ -174,18 +174,32 @@ azure_model_map: fast_llm = config.fast_llm smart_llm = config.smart_llm - assert config.get_azure_kwargs(config.fast_llm)["deployment_id"] == "FAST-LLM_ID" - assert config.get_azure_kwargs(config.smart_llm)["deployment_id"] == "SMART-LLM_ID" + assert ( + config.get_azure_credentials(config.fast_llm)["deployment_id"] == "FAST-LLM_ID" + ) + assert ( + config.get_azure_credentials(config.smart_llm)["deployment_id"] + == "SMART-LLM_ID" + ) # Emulate --gpt4only config.fast_llm = smart_llm - assert config.get_azure_kwargs(config.fast_llm)["deployment_id"] == "SMART-LLM_ID" - assert config.get_azure_kwargs(config.smart_llm)["deployment_id"] == "SMART-LLM_ID" + assert ( + config.get_azure_credentials(config.fast_llm)["deployment_id"] == "SMART-LLM_ID" + ) + assert ( + config.get_azure_credentials(config.smart_llm)["deployment_id"] + == "SMART-LLM_ID" + ) # Emulate --gpt3only config.fast_llm = config.smart_llm = fast_llm - assert config.get_azure_kwargs(config.fast_llm)["deployment_id"] == "FAST-LLM_ID" - assert config.get_azure_kwargs(config.smart_llm)["deployment_id"] == "FAST-LLM_ID" + assert ( + config.get_azure_credentials(config.fast_llm)["deployment_id"] == "FAST-LLM_ID" + ) 
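The assertions above and below exercise only the Azure path, but the regression being fixed is that api_base and organization were silently dropped for plain OpenAI configurations. A companion check in the same style, using the get_openai_credentials accessor introduced above (the fixture values are hypothetical):

    def test_get_openai_credentials_non_azure(config) -> None:
        # With use_azure disabled, the generic credentials should pass
        # api_key, api_base and organization straight through.
        config.use_azure = False
        credentials = config.get_openai_credentials(config.fast_llm)
        assert credentials["api_key"] == config.openai_api_key
        assert credentials["api_base"] == config.openai_api_base
        assert credentials["organization"] == config.openai_organization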
+ assert ( + config.get_azure_credentials(config.smart_llm)["deployment_id"] == "FAST-LLM_ID" + ) del os.environ["USE_AZURE"] del os.environ["AZURE_CONFIG_FILE"] -- cgit v1.2.3 From 4d514694738eb1a9a581136e85cb6aeb0ba27d63 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Mon, 10 Jul 2023 18:13:59 +0200 Subject: Fix CI cassette checkout --- .github/workflows/ci.yml | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3e21d1d70..dde98cf91 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -108,22 +108,27 @@ jobs: if: ${{ startsWith(github.event_name, 'pull_request') }} run: | cassette_branch="${{ github.event.pull_request.user.login }}-${{ github.event.pull_request.head.ref }}" + cassette_base_branch="${{ github.event.pull_request.base.ref }}" cd tests/Auto-GPT-test-cassettes + if ! git ls-remote --exit-code --heads origin $cassette_base_branch ; then + cassette_base_branch="master" + fi + if git ls-remote --exit-code --heads origin $cassette_branch ; then git fetch origin $cassette_branch - git fetch origin ${{ github.event.pull_request.base.ref }} + git fetch origin $cassette_base_branch git checkout $cassette_branch # Pick non-conflicting cassette updates from the base branch - git merge --no-commit --strategy-option=ours origin/${{ github.event.pull_request.base.ref }} + git merge --no-commit --strategy-option=ours origin/$cassette_base_branch echo "Using cassettes from mirror branch '$cassette_branch'," \ - "synced to upstream branch '${{ github.event.pull_request.base.ref }}'." + "synced to upstream branch '$cassette_base_branch'." else git checkout -b $cassette_branch echo "Branch '$cassette_branch' does not exist in cassette submodule." \ - "Using cassettes from '${{ github.event.pull_request.base.ref }}'." + "Using cassettes from '$cassette_base_branch'." fi - name: Set up Python ${{ matrix.python-version }} -- cgit v1.2.3 From 30ba51593f277711148da30f465417adb848472c Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Mon, 10 Jul 2023 09:19:12 -0700 Subject: Add Helicone (#81) --- .github/workflows/ci.yml | 1 + agbenchmark/challenge.py | 3 ++- agbenchmark/challenges/test_all.py | 3 ++- agent/Auto-GPT | 2 +- 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d989389db..cac1dedb1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -172,6 +172,7 @@ jobs: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} AGENT_NAME: ${{ matrix.agent-name }} PROMPT_USER: false # For mini-agi. TODO: Remove this once mini-agi follows the standards. 
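The HELICONE_API_KEY secret added below is consumed by the bumped Auto-GPT submodule rather than by the benchmark harness itself, so the wiring is not visible in this patch. Helicone's documented pattern is to route OpenAI traffic through its proxy; a minimal sketch, noting that the base URL and header name come from Helicone's public docs rather than from this repository:

    import os

    import openai

    # Point the pre-1.0 OpenAI SDK at Helicone's proxy so each request is
    # logged; the Helicone key travels in a Helicone-Auth header while the
    # OpenAI key is used as normal.
    openai.api_base = "https://oai.hconeai.com/v1"
    openai.api_key = os.environ["OPENAI_API_KEY"]

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "ping"}],
        headers={"Helicone-Auth": "Bearer " + os.environ["HELICONE_API_KEY"]},
    )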
+ HELICONE_API_KEY: ${{ secrets.HELICONE_API_KEY }} - name: Upload logs as artifact if: always() diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py index cf7ce104c..aeebd7ad8 100644 --- a/agbenchmark/challenge.py +++ b/agbenchmark/challenge.py @@ -7,6 +7,7 @@ from typing import Any, Dict, List from dotenv import load_dotenv from agbenchmark.challenges.define_task_types import ChallengeData, Ground +from agbenchmark.start_benchmark import CURRENT_DIRECTORY load_dotenv() @@ -23,7 +24,7 @@ class Challenge(ABC): @property def data(self) -> ChallengeData: - file_path = f"{self.CHALLENGE_LOCATION}/data.json" + file_path = f"{CURRENT_DIRECTORY}/../{self.CHALLENGE_LOCATION}/data.json" if file_path not in Challenge._data_cache: Challenge._data_cache[file_path] = ChallengeData.deserialize(file_path) return Challenge._data_cache[file_path] diff --git a/agbenchmark/challenges/test_all.py b/agbenchmark/challenges/test_all.py index 4f9e5b7f8..e7fe99e73 100644 --- a/agbenchmark/challenges/test_all.py +++ b/agbenchmark/challenges/test_all.py @@ -10,13 +10,14 @@ import pytest from dotenv import load_dotenv from agbenchmark.challenge import Challenge +from agbenchmark.start_benchmark import CURRENT_DIRECTORY load_dotenv() IMPROVE = os.getenv("IMPROVE", "False") -json_files = glob.glob("agbenchmark/challenges/**/data.json", recursive=True) +json_files = glob.glob(f"{CURRENT_DIRECTORY}/challenges/**/data.json", recursive=True) def get_test_path(json_file: str) -> str: diff --git a/agent/Auto-GPT b/agent/Auto-GPT index cec424ad2..f360d503b 160000 --- a/agent/Auto-GPT +++ b/agent/Auto-GPT @@ -1 +1 @@ -Subproject commit cec424ad2504020a830c3af9f74536a420545931 +Subproject commit f360d503b113119f6b3ce0acff1dbb4dfae2223a -- cgit v1.2.3 From 437e066a66c4f3d6aeba26f79fe1c3d8e4ea5743 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Mon, 10 Jul 2023 17:46:03 -0700 Subject: Add "Simple web server" challenge (#74) Co-authored-by: Silen Naihin --- agbenchmark/RegressionManager.py | 15 ++++- .../challenges/code/d3/custom_python/api_tests.py | 27 ++++++++ agbenchmark/challenges/code/d3/data.json | 18 ++++++ agbenchmark/challenges/test_all.py | 25 +++++++- poetry.lock | 31 ++++++++- pyproject.toml | 1 + regression_tests.json | 73 ++++++++++++++-------- 7 files changed, 160 insertions(+), 30 deletions(-) create mode 100644 agbenchmark/challenges/code/d3/custom_python/api_tests.py create mode 100644 agbenchmark/challenges/code/d3/data.json diff --git a/agbenchmark/RegressionManager.py b/agbenchmark/RegressionManager.py index e289a4787..ac9efc696 100644 --- a/agbenchmark/RegressionManager.py +++ b/agbenchmark/RegressionManager.py @@ -1,4 +1,5 @@ import json +from typing import Union class RegressionManager: @@ -15,7 +16,9 @@ class RegressionManager: f.read().strip() ) # read the content and remove any leading/trailing whitespace if file_content: # if file is not empty, load the json - self.tests = json.loads(file_content) + data = json.loads(file_content) + self.tests = {k: data[k] for k in sorted(data)} + data = self.replace_backslash(data) else: # if file is empty, assign an empty dictionary self.tests = {} except FileNotFoundError: @@ -36,3 +39,13 @@ class RegressionManager: if test_name in self.tests: del self.tests[test_name] self.save() + + def replace_backslash(self, value: str) -> Union[str, list[str], dict]: + if isinstance(value, str): + return value.replace("\\\\", "/") # escape \ with \\ + elif isinstance(value, list): + return [self.replace_backslash(i) for i in value] + elif isinstance(value, 
dict): + return {k: self.replace_backslash(v) for k, v in value.items()} + else: + return value diff --git a/agbenchmark/challenges/code/d3/custom_python/api_tests.py b/agbenchmark/challenges/code/d3/custom_python/api_tests.py new file mode 100644 index 000000000..1d6255ebd --- /dev/null +++ b/agbenchmark/challenges/code/d3/custom_python/api_tests.py @@ -0,0 +1,27 @@ +import os +from typing import Any, Dict +from unittest.mock import Mock, patch + +import requests + + +def make_assertion() -> None: + if os.environ.get("MOCK_TEST", "False").lower() == "true": + mock_response = Mock(requests.Response) + mock_response.status_code = 200 + mock_response.json.return_value = {"status": "OK"} + + with patch("requests.get", return_value=mock_response): + make_request_and_assert() + else: + make_request_and_assert() + + +def make_request_and_assert() -> Dict[str, Any]: + response = requests.get("http://localhost:8079/health") + if response.status_code != 200: + raise AssertionError( + f"Expected status code 200, but got {response.status_code}" + ) + + return response.json() diff --git a/agbenchmark/challenges/code/d3/data.json b/agbenchmark/challenges/code/d3/data.json new file mode 100644 index 000000000..07d607f5f --- /dev/null +++ b/agbenchmark/challenges/code/d3/data.json @@ -0,0 +1,18 @@ +{ + "name": "TestCreateSimpleWebServer", + "category": ["code"], + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "dependencies": ["TestDebugSimpleTypoWithGuidance"], + "ground": { + "answer": "GET localhost:8079/health responds with a 200 OK", + "should_contain": [], + "should_not_contain": [], + "files": [], + "type": "custom_python" + }, + "info": { + "difficulty": "medium", + "description": "Tests ability for the agent to build a simple web server locally", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/test_all.py b/agbenchmark/challenges/test_all.py index e7fe99e73..7dee0b2ab 100644 --- a/agbenchmark/challenges/test_all.py +++ b/agbenchmark/challenges/test_all.py @@ -2,6 +2,8 @@ import glob import importlib import json import os +import pkgutil +import sys import types from pathlib import Path from typing import Any, Dict @@ -47,6 +49,19 @@ def generate_tests() -> None: class_name = data.get("name", "") challenge_location = get_test_path(json_file) + if data["ground"]["type"] == "custom_python": + custom_python_location = ( + f"{CURRENT_DIRECTORY}/../{challenge_location}/custom_python" + ) + sys.path.append(str(custom_python_location)) + + for (module_loader, name, ispkg) in pkgutil.iter_modules( + [str(custom_python_location)] + ): + module = importlib.import_module(name) + + if hasattr(module, "make_assertion"): + make_assertion = getattr(module, "make_assertion") # Define test class dynamically challenge_class = types.new_class(class_name, (Challenge,)) @@ -58,7 +73,15 @@ def generate_tests() -> None: self.setup_challenge(config) scores = self.get_scores(config) - assert 1 in scores + + # Check if make_assertion is defined and use it + if "make_assertion" in locals(): + try: + make_assertion() + except AssertionError as error: + print(error) # Or handle this in another way + else: + assert 1 in scores # Parametrize the method here test_method = pytest.mark.parametrize( diff --git a/poetry.lock b/poetry.lock index 4eae340b6..5526da16b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry and should not be changed by hand. 
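The new "custom_python" ground type above works because test_all.py scans the challenge's folder at collection time and imports whatever it finds. A stripped-down sketch of that discovery pattern; the folder path is illustrative:

import importlib
import pkgutil
import sys

def load_assertions(custom_python_location: str) -> list:
    """Import each module in the folder and collect its make_assertion, if any."""
    sys.path.append(custom_python_location)
    assertions = []
    for _finder, name, _ispkg in pkgutil.iter_modules([custom_python_location]):
        module = importlib.import_module(name)
        if hasattr(module, "make_assertion"):
            assertions.append(getattr(module, "make_assertion"))
    return assertions

# Each collected callable raises AssertionError on failure:
for assertion in load_assertions("agbenchmark/challenges/code/d3/custom_python"):
    assertion()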
+# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. [[package]] name = "aiohttp" @@ -961,6 +961,33 @@ notebook = ["ipywidgets (>=6)"] slack = ["slack-sdk"] telegram = ["requests"] +[[package]] +name = "types-requests" +version = "2.31.0.1" +description = "Typing stubs for requests" +category = "main" +optional = false +python-versions = "*" +files = [ + {file = "types-requests-2.31.0.1.tar.gz", hash = "sha256:3de667cffa123ce698591de0ad7db034a5317457a596eb0b4944e5a9d9e8d1ac"}, + {file = "types_requests-2.31.0.1-py3-none-any.whl", hash = "sha256:afb06ef8f25ba83d59a1d424bd7a5a939082f94b94e90ab5e6116bd2559deaa3"}, +] + +[package.dependencies] +types-urllib3 = "*" + +[[package]] +name = "types-urllib3" +version = "1.26.25.13" +description = "Typing stubs for urllib3" +category = "main" +optional = false +python-versions = "*" +files = [ + {file = "types-urllib3-1.26.25.13.tar.gz", hash = "sha256:3300538c9dc11dad32eae4827ac313f5d986b8b21494801f1bf97a1ac6c03ae5"}, + {file = "types_urllib3-1.26.25.13-py3-none-any.whl", hash = "sha256:5dbd1d2bef14efee43f5318b5d36d805a489f6600252bb53626d4bfafd95e27c"}, +] + [[package]] name = "typing-extensions" version = "4.7.1" @@ -1082,4 +1109,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "44b5789494e73f3cb8bcb9d25daa62143e59352a246fd7724fdb3ad58c2560ae" +content-hash = "81b84bbe08d4a09fb6a4f99c7fb018e0c0fcd879fa368c388b0af20c7c9a3f31" diff --git a/pyproject.toml b/pyproject.toml index a8f4f8dee..1a96a51de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ pydantic = "^1.10.9" pytest-depends = "^1.0.1" python-dotenv = "^0.21.0" click = "^8.1.3" +types-requests = "^2.31.0.1" [tool.poetry.group.dev.dependencies] flake8 = "^3.9.2" diff --git a/regression_tests.json b/regression_tests.json index 10a6e11bf..0cf2d5f30 100644 --- a/regression_tests.json +++ b/regression_tests.json @@ -1,69 +1,90 @@ { - "TestWriteFile": { + "TestBasicMemory": { "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark\\challenges\\interface\\write_file" + "dependencies": [ + "TestReadFile", + "TestWriteFile" + ], + "test": "agbenchmark/challenges/memory/m1" }, - "TestReadFile": { + "TestBasicRetrieval": { "difficulty": "basic", "dependencies": [ - "TestWriteFile" + "TestWriteFile", + "TestSearch" ], - "test": "agbenchmark\\challenges\\interface\\read_file" + "test": "agbenchmark/challenges/retrieval/r1" }, - "TestBasicMemory": { + "TestCreateSimpleWebServer": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/code/d3" + }, + "TestDebugSimpleTypoWithGuidance": { "difficulty": "basic", "dependencies": [ "TestReadFile", "TestWriteFile" ], - "test": "agbenchmark\\challenges\\memory\\m1" + "test": "agbenchmark/challenges/code/d1" }, - "TestBasicRetrieval": { + "TestDebugSimpleTypoWithoutGuidance": { + "difficulty": "medium", + "dependencies": [ + "TestDebugSimpleTypoWithGuidance" + ], + "test": "agbenchmark/challenges/code/d2" + }, + "TestReadFile": { "difficulty": "basic", "dependencies": [ "TestWriteFile" ], - "test": "agbenchmark\\challenges\\retrieval\\r1" + "test": "agbenchmark/challenges/interface/read_file" }, "TestRememberMultipleIds": { "difficulty": "basic", "dependencies": [ "TestBasicMemory" ], - "test": "agbenchmark\\challenges\\memory\\m2" + "test": "agbenchmark/challenges/memory/m2" }, - "TestRetrieval2": { - "difficulty": "basic", + "TestRememberMultipleIdsWithNoise": { + "difficulty": "medium", "dependencies": [ - 
"TestBasicRetrieval" + "TestRememberMultipleIds" ], - "test": "agbenchmark\\challenges\\retrieval\\r2" + "test": "agbenchmark/challenges/memory/m3" }, - "TestRememberMultipleIdsWithNoise": { + "TestRememberMultiplePhrasesWithNoise": { "difficulty": "medium", "dependencies": [ - "TestRememberMultipleIds" + "TestRememberMultipleIdsWithNoise" ], - "test": "agbenchmark\\challenges\\memory\\m3" + "test": "agbenchmark/challenges/memory/m4" }, - "TestRetrieval3": { + "TestRetrieval2": { "difficulty": "basic", "dependencies": [ - "TestRetrieval2" + "TestBasicRetrieval" ], - "test": "agbenchmark\\challenges\\retrieval\\r3" + "test": "agbenchmark/challenges/retrieval/r2" }, - "TestRememberMultiplePhrasesWithNoise": { - "difficulty": "medium", + "TestRetrieval3": { + "difficulty": "basic", "dependencies": [ - "TestRememberMultipleIdsWithNoise" + "TestRetrieval2" ], - "test": "agbenchmark\\challenges\\memory\\m4" + "test": "agbenchmark/challenges/retrieval/r3" }, "TestSearch": { "difficulty": "basic", "dependencies": [], - "test": "agbenchmark\\challenges\\interface\\search" + "test": "agbenchmark/challenges/interface/search" + }, + "TestWriteFile": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/interface/write_file" } } \ No newline at end of file -- cgit v1.2.3 From 8df82909b2938424d387cdaa817821adcbee1dac Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Mon, 10 Jul 2023 22:25:19 -0400 Subject: Added --test, consolidate files, reports working (#83) --- agbenchmark/RegressionManager.py | 51 ---------- agbenchmark/ReportManager.py | 68 ++++++++++++++ agbenchmark/agent_interface.py | 14 ++- agbenchmark/challenges/define_task_types.py | 6 ++ agbenchmark/challenges/interface/search/data.json | 2 +- agbenchmark/challenges/test_all.py | 4 +- agbenchmark/config.json | 5 + agbenchmark/conftest.py | 19 +++- agbenchmark/regression_tests.json | 99 ++++++++++++++++++++ agbenchmark/reports/1.json | 109 ++++++++++++++++++++++ agbenchmark/start_benchmark.py | 59 ++++++++---- agbenchmark/utils.py | 16 ++++ agent/Auto-GPT | 2 +- agent/SuperAGI | 2 +- agent/config_example.json | 3 +- agent/gpt-engineer | 2 +- agent/mini-agi | 2 +- agent/smol-developer | 2 +- config.json | 6 -- mypy.ini | 2 +- regression_tests.json | 90 ------------------ 21 files changed, 375 insertions(+), 188 deletions(-) delete mode 100644 agbenchmark/RegressionManager.py create mode 100644 agbenchmark/ReportManager.py create mode 100644 agbenchmark/config.json create mode 100644 agbenchmark/regression_tests.json create mode 100644 agbenchmark/reports/1.json delete mode 100644 config.json delete mode 100644 regression_tests.json diff --git a/agbenchmark/RegressionManager.py b/agbenchmark/RegressionManager.py deleted file mode 100644 index ac9efc696..000000000 --- a/agbenchmark/RegressionManager.py +++ /dev/null @@ -1,51 +0,0 @@ -import json -from typing import Union - - -class RegressionManager: - """Abstracts interaction with the regression tests file""" - - def __init__(self, filename: str): - self.filename = filename - self.load() - - def load(self) -> None: - try: - with open(self.filename, "r") as f: - file_content = ( - f.read().strip() - ) # read the content and remove any leading/trailing whitespace - if file_content: # if file is not empty, load the json - data = json.loads(file_content) - self.tests = {k: data[k] for k in sorted(data)} - data = self.replace_backslash(data) - else: # if file is empty, assign an empty dictionary - self.tests = {} - except FileNotFoundError: - self.tests = {} - except 
json.decoder.JSONDecodeError: # If JSON is invalid - self.tests = {} - self.save() - - def save(self) -> None: - with open(self.filename, "w") as f: - json.dump(self.tests, f, indent=4) - - def add_test(self, test_name: str, test_details: dict) -> None: - self.tests[test_name] = test_details - self.save() - - def remove_test(self, test_name: str) -> None: - if test_name in self.tests: - del self.tests[test_name] - self.save() - - def replace_backslash(self, value: str) -> Union[str, list[str], dict]: - if isinstance(value, str): - return value.replace("\\\\", "/") # escape \ with \\ - elif isinstance(value, list): - return [self.replace_backslash(i) for i in value] - elif isinstance(value, dict): - return {k: self.replace_backslash(v) for k, v in value.items()} - else: - return value diff --git a/agbenchmark/ReportManager.py b/agbenchmark/ReportManager.py new file mode 100644 index 000000000..e6d8f62f6 --- /dev/null +++ b/agbenchmark/ReportManager.py @@ -0,0 +1,68 @@ +import json +import os +import sys +import time +from datetime import datetime +from typing import Any, Dict, Union + + +class ReportManager: + """Abstracts interaction with the regression tests file""" + + def __init__(self, filename: str): + self.filename = filename + self.start_time = time.time() + self.load() + + def load(self) -> None: + try: + with open(self.filename, "r") as f: + file_content = ( + f.read().strip() + ) # read the content and remove any leading/trailing whitespace + if file_content: # if file is not empty, load the json + data = json.loads(file_content) + self.tests = {k: data[k] for k in sorted(data)} + data = self.replace_backslash(data) + else: # if file is empty, assign an empty dictionary + self.tests = {} + except FileNotFoundError: + self.tests = {} + except json.decoder.JSONDecodeError: # If JSON is invalid + self.tests = {} + self.save() + + def save(self) -> None: + with open(self.filename, "w") as f: + json.dump(self.tests, f, indent=4) + + def add_test(self, test_name: str, test_details: dict) -> None: + self.tests[test_name] = test_details + self.save() + + def remove_test(self, test_name: str) -> None: + if test_name in self.tests: + del self.tests[test_name] + self.save() + + def end_info_report(self, config: Dict[str, Any]) -> None: + command = " ".join(sys.argv) + self.tests = { + "command": command.split(os.sep)[-1], + "completion_time": datetime.now().strftime("%Y-%m-%d-%H:%M"), + "time_elapsed": str(round(time.time() - self.start_time, 2)) + " seconds", + "tests": self.tests, + "config": config, + } + + self.save() + + def replace_backslash(self, value: str) -> Union[str, list[str], dict]: + if isinstance(value, str): + return value.replace("\\\\", "/") # escape \ with \\ + elif isinstance(value, list): + return [self.replace_backslash(i) for i in value] + elif isinstance(value, dict): + return {k: self.replace_backslash(v) for k, v in value.items()} + else: + return value diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index 1d43577c7..d058ad4c2 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -3,6 +3,7 @@ import shutil import subprocess import sys import time +from pathlib import Path from typing import Any, Dict from dotenv import load_dotenv @@ -21,6 +22,7 @@ def run_agent( """Calling to get a response""" if MOCK_FLAG: + print("ITS A MOCK TEST", challenge_location) copy_artifacts_into_workspace( config["workspace"], "artifacts_out", challenge_location ) @@ -30,19 +32,13 @@ def run_agent( f"Running Python function 
'{config['entry_path']}' with timeout {timeout}" ) - # Get the current working directory - cwd = os.path.join(os.getcwd(), config["home_path"]) - - # Add current directory to Python's import path - sys.path.append(cwd) - command = [sys.executable, config["entry_path"], str(task)] process = subprocess.Popen( command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True, - cwd=cwd, + cwd=os.getcwd(), ) start_time = time.time() @@ -79,7 +75,9 @@ def run_agent( def copy_artifacts_into_workspace( workspace: str, artifact_folder_name: str, challenge_dir_path: str ) -> None: - source_dir = os.path.join(challenge_dir_path, artifact_folder_name) + # this file is at agbenchmark\agent_interface.py + script_dir = Path(__file__).resolve().parent.parent + source_dir = os.path.join(script_dir, challenge_dir_path, artifact_folder_name) # Check if source_dir exists, if not then return immediately. if not os.path.exists(source_dir): diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index 94cba5b72..f4e3f2220 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -1,4 +1,5 @@ import json +from pathlib import Path from typing import List, Optional from pydantic import BaseModel @@ -32,7 +33,12 @@ class ChallengeData(BaseModel): @staticmethod def deserialize(path: str) -> "ChallengeData": + # this script is in root/agbenchmark/challenges/define_task_types.py + script_dir = Path(__file__).resolve().parent.parent.parent + path = str(script_dir / path) + print("Deserializing", path) + with open(path, "r") as file: data = json.load(file) return ChallengeData(**data) diff --git a/agbenchmark/challenges/interface/search/data.json b/agbenchmark/challenges/interface/search/data.json index 17ee1ac1a..f59b2dc9b 100644 --- a/agbenchmark/challenges/interface/search/data.json +++ b/agbenchmark/challenges/interface/search/data.json @@ -2,7 +2,7 @@ "name": "TestSearch", "category": ["interface"], "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", - "dependencies": [], + "dependencies": ["TestWriteFile"], "ground": { "answer": "This is a Heading\nThis is a paragraph.", "should_contain": ["Heading", "paragraph"], diff --git a/agbenchmark/challenges/test_all.py b/agbenchmark/challenges/test_all.py index 7dee0b2ab..f8bb23471 100644 --- a/agbenchmark/challenges/test_all.py +++ b/agbenchmark/challenges/test_all.py @@ -19,7 +19,7 @@ load_dotenv() IMPROVE = os.getenv("IMPROVE", "False") -json_files = glob.glob(f"{CURRENT_DIRECTORY}/challenges/**/data.json", recursive=True) +json_files = glob.glob(f"{CURRENT_DIRECTORY}/**/data.json", recursive=True) def get_test_path(json_file: str) -> str: @@ -55,7 +55,7 @@ def generate_tests() -> None: ) sys.path.append(str(custom_python_location)) - for (module_loader, name, ispkg) in pkgutil.iter_modules( + for module_loader, name, ispkg in pkgutil.iter_modules( [str(custom_python_location)] ): module = importlib.import_module(name) diff --git a/agbenchmark/config.json b/agbenchmark/config.json new file mode 100644 index 000000000..9dd8b16ab --- /dev/null +++ b/agbenchmark/config.json @@ -0,0 +1,5 @@ +{ + "workspace": "${os.path.join(Path.home(), 'miniagi')}", + "entry_path": "agbenchmark/benchmarks.py", + "cutoff": 60 +} diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index e321f5a26..87fdc9c10 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -6,9 +6,10 @@ from typing import Any, 
Dict, Generator import pytest -from agbenchmark.RegressionManager import RegressionManager +from agbenchmark.ReportManager import ReportManager from agbenchmark.start_benchmark import ( CONFIG_PATH, + INFO_TESTS_PATH, REGRESSION_TESTS_PATH, get_regression_data, ) @@ -106,7 +107,8 @@ def challenge_data(request: Any) -> None: return request.param -regression_manager = RegressionManager(REGRESSION_TESTS_PATH) +regression_manager = ReportManager(REGRESSION_TESTS_PATH) +info_manager = ReportManager(INFO_TESTS_PATH) def pytest_runtest_makereport(item: Any, call: Any) -> None: @@ -130,12 +132,21 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None: print("pytest_runtest_makereport", test_details) if call.excinfo is None: regression_manager.add_test(item.nodeid.split("::")[1], test_details) + test_details["success"] = True else: regression_manager.remove_test(item.nodeid.split("::")[1]) + test_details["success"] = False + test_details["fail_reason"] = str(call.excinfo.value) + info_manager.add_test(item.nodeid.split("::")[1], test_details) -def pytest_sessionfinish() -> None: - """Called at the end of the session to save regression tests""" + +def pytest_sessionfinish(session: Any) -> None: + """Called at the end of the session to save regression tests and info""" + with open(CONFIG_PATH, "r") as f: + config = json.load(f) + + info_manager.end_info_report(config) regression_manager.save() diff --git a/agbenchmark/regression_tests.json b/agbenchmark/regression_tests.json new file mode 100644 index 000000000..68632a127 --- /dev/null +++ b/agbenchmark/regression_tests.json @@ -0,0 +1,99 @@ +{ + "TestReadFile": { + "difficulty": "basic", + "dependencies": [ + "TestWriteFile" + ], + "test": "agbenchmark/challenges/interface/read_file", + "success": true + }, + "TestBasicMemory": { + "difficulty": "basic", + "dependencies": [ + "TestReadFile", + "TestWriteFile" + ], + "test": "agbenchmark/challenges/memory/m1", + "success": true + }, + "TestBasicRetrieval": { + "difficulty": "basic", + "dependencies": [ + "TestWriteFile", + "TestSearch" + ], + "test": "agbenchmark/challenges/retrieval/r1", + "success": true + }, + "TestRememberMultipleIds": { + "difficulty": "basic", + "dependencies": [ + "TestBasicMemory" + ], + "test": "agbenchmark/challenges/memory/m2", + "success": true + }, + "TestRetrieval2": { + "difficulty": "basic", + "dependencies": [ + "TestBasicRetrieval" + ], + "test": "agbenchmark/challenges/retrieval/r2", + "success": true + }, + "TestRememberMultipleIdsWithNoise": { + "difficulty": "medium", + "dependencies": [ + "TestRememberMultipleIds" + ], + "test": "agbenchmark/challenges/memory/m3", + "success": true + }, + "TestRetrieval3": { + "difficulty": "basic", + "dependencies": [ + "TestRetrieval2" + ], + "test": "agbenchmark/challenges/retrieval/r3", + "success": true + }, + "TestRememberMultiplePhrasesWithNoise": { + "difficulty": "medium", + "dependencies": [ + "TestRememberMultipleIdsWithNoise" + ], + "test": "agbenchmark/challenges/memory/m4", + "success": true + }, + "TestSearch": { + "difficulty": "basic", + "dependencies": [ + "TestWriteFile" + ], + "test": "agbenchmark/challenges/interface/search", + "success": true + }, + "TestWriteFile": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/interface/write_file", + "success": true + }, + "TestDebugSimpleTypoWithGuidance": { + "difficulty": "basic", + "dependencies": [ + "TestReadFile", + "TestWriteFile" + ], + "test": "agbenchmark/challenges/code/d1", + "success": true + }, + 
"TestDebugSimpleTypoWithoutGuidance": { + "difficulty": "medium", + "dependencies": [ + "TestDebugSimpleTypoWithGuidance" + ], + "test": "agbenchmark/challenges/code/d2", + "success": true + } +} \ No newline at end of file diff --git a/agbenchmark/reports/1.json b/agbenchmark/reports/1.json new file mode 100644 index 000000000..df07fb878 --- /dev/null +++ b/agbenchmark/reports/1.json @@ -0,0 +1,109 @@ +{ + "command": "agbenchmark start --mock", + "completion_time": "2023-07-10-21:19", + "time_elapsed": "8.75 seconds", + "tests": { + "TestWriteFile": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/interface/write_file", + "success": true + }, + "TestReadFile": { + "difficulty": "basic", + "dependencies": [ + "TestWriteFile" + ], + "test": "agbenchmark/challenges/interface/read_file", + "success": true + }, + "TestSearch": { + "difficulty": "basic", + "dependencies": [ + "TestWriteFile" + ], + "test": "agbenchmark/challenges/interface/search", + "success": true + }, + "TestDebugSimpleTypoWithGuidance": { + "difficulty": "basic", + "dependencies": [ + "TestReadFile", + "TestWriteFile" + ], + "test": "agbenchmark/challenges/code/d1", + "success": true + }, + "TestBasicMemory": { + "difficulty": "basic", + "dependencies": [ + "TestReadFile", + "TestWriteFile" + ], + "test": "agbenchmark/challenges/memory/m1", + "success": true + }, + "TestBasicRetrieval": { + "difficulty": "basic", + "dependencies": [ + "TestWriteFile", + "TestSearch" + ], + "test": "agbenchmark/challenges/retrieval/r1", + "success": true + }, + "TestDebugSimpleTypoWithoutGuidance": { + "difficulty": "medium", + "dependencies": [ + "TestDebugSimpleTypoWithGuidance" + ], + "test": "agbenchmark/challenges/code/d2", + "success": true + }, + "TestRememberMultipleIds": { + "difficulty": "basic", + "dependencies": [ + "TestBasicMemory" + ], + "test": "agbenchmark/challenges/memory/m2", + "success": true + }, + "TestRetrieval2": { + "difficulty": "basic", + "dependencies": [ + "TestBasicRetrieval" + ], + "test": "agbenchmark/challenges/retrieval/r2", + "success": true + }, + "TestRememberMultipleIdsWithNoise": { + "difficulty": "medium", + "dependencies": [ + "TestRememberMultipleIds" + ], + "test": "agbenchmark/challenges/memory/m3", + "success": true + }, + "TestRetrieval3": { + "difficulty": "basic", + "dependencies": [ + "TestRetrieval2" + ], + "test": "agbenchmark/challenges/retrieval/r3", + "success": true + }, + "TestRememberMultiplePhrasesWithNoise": { + "difficulty": "medium", + "dependencies": [ + "TestRememberMultipleIdsWithNoise" + ], + "test": "agbenchmark/challenges/memory/m4", + "success": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}", + "entry_path": "agbenchmark/benchmarks.py", + "cutoff": 60 + } +} \ No newline at end of file diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index 68c7932be..917cd4e8a 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -10,12 +10,16 @@ from dotenv import load_dotenv load_dotenv() +from agbenchmark.utils import calculate_info_test_path + CURRENT_DIRECTORY = Path(__file__).resolve().parent +benchmarks_folder_path = Path(os.getcwd()) / "agbenchmark" -CONFIG_PATH = str(Path(os.getcwd()) / "config.json") +CONFIG_PATH = str(benchmarks_folder_path / "config.json") +REGRESSION_TESTS_PATH = str(benchmarks_folder_path / "regression_tests.json") -REGRESSION_TESTS_PATH = str(Path(os.getcwd()) / "regression_tests.json") +INFO_TESTS_PATH = 
calculate_info_test_path(benchmarks_folder_path) @click.group() @@ -25,10 +29,11 @@ def cli() -> None: @cli.command() @click.option("--category", default=None, help="Specific category to run") +@click.option("--test", default=None, help="Specific test to run") @click.option("--maintain", is_flag=True, help="Runs only regression tests") @click.option("--improve", is_flag=True, help="Run only non-regression tests") @click.option("--mock", is_flag=True, help="Run with mock") -def start(category: str, maintain: bool, improve: bool, mock: bool) -> int: +def start(category: str, test: str, maintain: bool, improve: bool, mock: bool) -> int: """Start the benchmark tests. If a category flag is provided, run the categories with that mark.""" # Check if configuration file exists and is not empty if maintain and improve: @@ -37,6 +42,16 @@ def start(category: str, maintain: bool, improve: bool, mock: bool) -> int: ) return 1 + if test and (category or maintain or improve): + print( + "Error: If you're running a specific test make sure no other options are selected. Please just pass the --test." + ) + return 1 + + if not benchmarks_folder_path.exists(): + benchmarks_folder_path.mkdir(exist_ok=True) + + print(CONFIG_PATH, os.path.exists(CONFIG_PATH), os.stat(CONFIG_PATH).st_size) if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0: config = {} @@ -46,12 +61,12 @@ def start(category: str, maintain: bool, improve: bool, mock: bool) -> int: ) config["entry_path"] = click.prompt( - "Please enter a the path to your run_specific_agent function implementation", - default="/benchmarks.py", + "Please enter a the path to your run_specific_agent function implementation within the benchmarks folder", + default="benchmarks.py", ) config["cutoff"] = click.prompt( - "Please enter a hard cutoff runtime for your agent", + "Please enter a hard cutoff runtime for your agent per test", default="60", ) @@ -65,7 +80,11 @@ def start(category: str, maintain: bool, improve: bool, mock: bool) -> int: os.environ["MOCK_TEST"] = "True" if mock else "False" if not os.path.exists(REGRESSION_TESTS_PATH): - with open(REGRESSION_TESTS_PATH, "a"): + with open(REGRESSION_TESTS_PATH, "w"): + pass + + if not os.path.exists(INFO_TESTS_PATH): + with open(INFO_TESTS_PATH, "w"): pass print("Current configuration:") @@ -73,18 +92,22 @@ def start(category: str, maintain: bool, improve: bool, mock: bool) -> int: print(f"{key}: {value}") pytest_args = ["-vs"] - if category: - pytest_args.extend(["-m", category]) - print("Starting benchmark tests ", category) + if test: + print("Running specific test:", test) + pytest_args.extend(["-k", test]) else: - print("Running all categories") - - if maintain: - print("Running only regression tests") - pytest_args.append("--maintain") - elif improve: - print("Running only non-regression tests") - pytest_args.append("--improve") + if category: + pytest_args.extend(["-m", category]) + print("Running tests of category:", category) + else: + print("Running all categories") + + if maintain: + print("Running only regression tests") + pytest_args.append("--maintain") + elif improve: + print("Running only non-regression tests") + pytest_args.append("--improve") if mock: pytest_args.append("--mock") diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py index b05a7ac31..ffde0c6d3 100644 --- a/agbenchmark/utils.py +++ b/agbenchmark/utils.py @@ -1 +1,17 @@ # radio charts, logs, helper functions for tests, anything else relevant. 
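One consequence of the dispatch above is worth stating concretely: --test short-circuits every other selection flag and maps to pytest's -k, while categories map to -m markers. A small sketch of the same argument assembly, with the CLI flag values as plain parameters:

def build_pytest_args(test=None, category=None, maintain=False, improve=False, mock=False):
    """Mirror of the CLI dispatch: --test wins, otherwise category/regression flags apply."""
    args = ["-vs"]
    if test:
        args.extend(["-k", test])          # run a single test by name
    else:
        if category:
            args.extend(["-m", category])  # run every test marked with the category
        if maintain:
            args.append("--maintain")      # regression tests only
        elif improve:
            args.append("--improve")       # non-regression tests only
    if mock:
        args.append("--mock")
    return args

assert build_pytest_args(test="TestWriteFile", mock=True) == ["-vs", "-k", "TestWriteFile", "--mock"]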
+import glob +from pathlib import Path + + +def calculate_info_test_path(benchmarks_folder_path: Path) -> str: + INFO_TESTS_PATH = benchmarks_folder_path / "reports" + + if not INFO_TESTS_PATH.exists(): + INFO_TESTS_PATH.mkdir(parents=True, exist_ok=True) + return str(INFO_TESTS_PATH / "1.json") + else: + json_files = glob.glob(str(INFO_TESTS_PATH / "*.json")) + file_count = len(json_files) + run_name = f"{file_count + 1}.json" + new_file_path = INFO_TESTS_PATH / run_name + return str(new_file_path) diff --git a/agent/Auto-GPT b/agent/Auto-GPT index f360d503b..dc2a76990 160000 --- a/agent/Auto-GPT +++ b/agent/Auto-GPT @@ -1 +1 @@ -Subproject commit f360d503b113119f6b3ce0acff1dbb4dfae2223a +Subproject commit dc2a76990c75fafacbeaa76eb2e27d48de44cadd diff --git a/agent/SuperAGI b/agent/SuperAGI index 7ab2994d4..a28224d82 160000 --- a/agent/SuperAGI +++ b/agent/SuperAGI @@ -1 +1 @@ -Subproject commit 7ab2994d4b44fa008f9ac27b196f134d27878916 +Subproject commit a28224d82572b598ccee1057086fabaf33e1aaa9 diff --git a/agent/config_example.json b/agent/config_example.json index ba2ec0b80..7ab65bc20 100644 --- a/agent/config_example.json +++ b/agent/config_example.json @@ -1,6 +1,5 @@ { "workspace": "projects/my-new-project/workspace", - "entry_path": "benchmarks.py", - "home_path": "", + "entry_path": "agbenchmark/benchmarks.py", "cutoff": 60 } diff --git a/agent/gpt-engineer b/agent/gpt-engineer index 4af8c137e..cde9be3e7 160000 --- a/agent/gpt-engineer +++ b/agent/gpt-engineer @@ -1 +1 @@ -Subproject commit 4af8c137e82cc51fdd31c23327ceffd64194b984 +Subproject commit cde9be3e73212b3d8366a4ed149a18122bfe2333 diff --git a/agent/mini-agi b/agent/mini-agi index 4af8a7e60..ad2b34505 160000 --- a/agent/mini-agi +++ b/agent/mini-agi @@ -1 +1 @@ -Subproject commit 4af8a7e6085f0518f06180fbf87024a2c9db4c88 +Subproject commit ad2b345050e07efb7ad0bde68c93bc2b4e2d7a92 diff --git a/agent/smol-developer b/agent/smol-developer index a1e4a9ff3..c52b14b1d 160000 --- a/agent/smol-developer +++ b/agent/smol-developer @@ -1 +1 @@ -Subproject commit a1e4a9ff3a75909c4a892e409a55f86a2c57b7c6 +Subproject commit c52b14b1d5b1b74d886f08d9914e7f43437f609d diff --git a/config.json b/config.json deleted file mode 100644 index 8bbcebdbd..000000000 --- a/config.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "benchmarks.py", - "home_path": "agent/mini-agi", - "cutoff": 60 -} diff --git a/mypy.ini b/mypy.ini index 764c239f1..d35c6962d 100644 --- a/mypy.ini +++ b/mypy.ini @@ -15,5 +15,5 @@ ignore_errors = True [mypy-agbenchmark.mocks.tests.basic_mocks.*] ignore_errors = True -[mypy-agbenchmark.tests.regression.RegressionManager.*] +[mypy-agbenchmark.tests.regression.ReportManager.*] ignore_errors = True diff --git a/regression_tests.json b/regression_tests.json deleted file mode 100644 index 0cf2d5f30..000000000 --- a/regression_tests.json +++ /dev/null @@ -1,90 +0,0 @@ -{ - "TestBasicMemory": { - "difficulty": "basic", - "dependencies": [ - "TestReadFile", - "TestWriteFile" - ], - "test": "agbenchmark/challenges/memory/m1" - }, - "TestBasicRetrieval": { - "difficulty": "basic", - "dependencies": [ - "TestWriteFile", - "TestSearch" - ], - "test": "agbenchmark/challenges/retrieval/r1" - }, - "TestCreateSimpleWebServer": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/code/d3" - }, - "TestDebugSimpleTypoWithGuidance": { - "difficulty": "basic", - "dependencies": [ - "TestReadFile", - "TestWriteFile" - ], - "test": 
"agbenchmark/challenges/code/d1" - }, - "TestDebugSimpleTypoWithoutGuidance": { - "difficulty": "medium", - "dependencies": [ - "TestDebugSimpleTypoWithGuidance" - ], - "test": "agbenchmark/challenges/code/d2" - }, - "TestReadFile": { - "difficulty": "basic", - "dependencies": [ - "TestWriteFile" - ], - "test": "agbenchmark/challenges/interface/read_file" - }, - "TestRememberMultipleIds": { - "difficulty": "basic", - "dependencies": [ - "TestBasicMemory" - ], - "test": "agbenchmark/challenges/memory/m2" - }, - "TestRememberMultipleIdsWithNoise": { - "difficulty": "medium", - "dependencies": [ - "TestRememberMultipleIds" - ], - "test": "agbenchmark/challenges/memory/m3" - }, - "TestRememberMultiplePhrasesWithNoise": { - "difficulty": "medium", - "dependencies": [ - "TestRememberMultipleIdsWithNoise" - ], - "test": "agbenchmark/challenges/memory/m4" - }, - "TestRetrieval2": { - "difficulty": "basic", - "dependencies": [ - "TestBasicRetrieval" - ], - "test": "agbenchmark/challenges/retrieval/r2" - }, - "TestRetrieval3": { - "difficulty": "basic", - "dependencies": [ - "TestRetrieval2" - ], - "test": "agbenchmark/challenges/retrieval/r3" - }, - "TestSearch": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/interface/search" - }, - "TestWriteFile": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/interface/write_file" - } -} \ No newline at end of file -- cgit v1.2.3 From 0799be7e28bf4805e5cd2c9296c142b31f9501a4 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Mon, 10 Jul 2023 21:54:25 -0700 Subject: Fix tests ci (#82) --- .github/workflows/ci.yml | 7 +------ agbenchmark/agent_interface.py | 8 +++++--- agbenchmark/challenges/test_all.py | 32 ++++++++++++++------------------ agent/Auto-GPT | 2 +- agent/gpt-engineer | 2 +- agent/smol-developer | 2 +- poetry.lock | 29 ++++++++++++++++++++++++++++- pyproject.toml | 1 + 8 files changed, 52 insertions(+), 31 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cac1dedb1..9df4173b2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -75,6 +75,7 @@ jobs: tests: name: ${{ matrix.agent-name }} runs-on: ubuntu-latest + timeout-minutes: 10 env: min-python-version: "3.10" strategy: @@ -156,16 +157,10 @@ jobs: if [ "${GITHUB_EVENT_NAME}" == "schedule" ] || [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ]; then agbenchmark start --maintain else - exit 0 agbenchmark start --maintain --mock agbenchmark start --improve --mock agbenchmark start --mock agbenchmark start --mock --category=retrieval - agbenchmark start --mock --category=regression - agbenchmark start --mock --category=interface - agbenchmark start --mock --category=code - agbenchmark start --mock --category=memory - agbenchmark start --mock --category=memory --category=code fi env: GITHUB_EVENT_NAME: ${{ github.event_name }} diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index d058ad4c2..713451f01 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -3,11 +3,12 @@ import shutil import subprocess import sys import time -from pathlib import Path from typing import Any, Dict from dotenv import load_dotenv +from agbenchmark.start_benchmark import CURRENT_DIRECTORY + load_dotenv() mock_test_str = os.getenv("MOCK_TEST") @@ -76,8 +77,9 @@ def copy_artifacts_into_workspace( workspace: str, artifact_folder_name: str, challenge_dir_path: str ) -> None: # this file is at agbenchmark\agent_interface.py - script_dir = 
Path(__file__).resolve().parent.parent - source_dir = os.path.join(script_dir, challenge_dir_path, artifact_folder_name) + source_dir = os.path.join( + CURRENT_DIRECTORY, "..", challenge_dir_path, artifact_folder_name + ) # Check if source_dir exists, if not then return immediately. if not os.path.exists(source_dir): diff --git a/agbenchmark/challenges/test_all.py b/agbenchmark/challenges/test_all.py index f8bb23471..00a6ed635 100644 --- a/agbenchmark/challenges/test_all.py +++ b/agbenchmark/challenges/test_all.py @@ -49,19 +49,6 @@ def generate_tests() -> None: class_name = data.get("name", "") challenge_location = get_test_path(json_file) - if data["ground"]["type"] == "custom_python": - custom_python_location = ( - f"{CURRENT_DIRECTORY}/../{challenge_location}/custom_python" - ) - sys.path.append(str(custom_python_location)) - - for module_loader, name, ispkg in pkgutil.iter_modules( - [str(custom_python_location)] - ): - module = importlib.import_module(name) - - if hasattr(module, "make_assertion"): - make_assertion = getattr(module, "make_assertion") # Define test class dynamically challenge_class = types.new_class(class_name, (Challenge,)) @@ -75,11 +62,20 @@ def generate_tests() -> None: scores = self.get_scores(config) # Check if make_assertion is defined and use it - if "make_assertion" in locals(): - try: - make_assertion() - except AssertionError as error: - print(error) # Or handle this in another way + if self.data.ground.type == "custom_python": + custom_python_location = ( + f"{CURRENT_DIRECTORY}/../{challenge_location}/custom_python" + ) + sys.path.append(str(custom_python_location)) + + for (module_loader, name, ispkg) in pkgutil.iter_modules( + [str(custom_python_location)] + ): + module = importlib.import_module(name) + + if hasattr(module, "make_assertion"): + make_assertion = getattr(module, "make_assertion") + make_assertion() else: assert 1 in scores diff --git a/agent/Auto-GPT b/agent/Auto-GPT index dc2a76990..ade8e6f81 160000 --- a/agent/Auto-GPT +++ b/agent/Auto-GPT @@ -1 +1 @@ -Subproject commit dc2a76990c75fafacbeaa76eb2e27d48de44cadd +Subproject commit ade8e6f8142a937160596a987ab96808b583f9e3 diff --git a/agent/gpt-engineer b/agent/gpt-engineer index cde9be3e7..538bcba6e 160000 --- a/agent/gpt-engineer +++ b/agent/gpt-engineer @@ -1 +1 @@ -Subproject commit cde9be3e73212b3d8366a4ed149a18122bfe2333 +Subproject commit 538bcba6efbb7cda7f6a355a8c8420bbbdb52f25 diff --git a/agent/smol-developer b/agent/smol-developer index c52b14b1d..150981f77 160000 --- a/agent/smol-developer +++ b/agent/smol-developer @@ -1 +1 @@ -Subproject commit c52b14b1d5b1b74d886f08d9914e7f43437f609d +Subproject commit 150981f77f19777bf5aa76cb3a74869e4a8a8a05 diff --git a/poetry.lock b/poetry.lock index 5526da16b..ad72f5e10 100644 --- a/poetry.lock +++ b/poetry.lock @@ -729,6 +729,21 @@ files = [ {file = "pathspec-0.11.1.tar.gz", hash = "sha256:2798de800fa92780e33acca925945e9a19a133b715067cf165b8866c15a31687"}, ] +[[package]] +name = "pexpect" +version = "4.8.0" +description = "Pexpect allows easy control of interactive console applications." 
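pexpect joins the dependency set in this commit. For orientation, a minimal sketch of the interactive process control it provides; the spawned command and prompt text are hypothetical, not anything the benchmark is shown running:

import pexpect

# Spawn a child process and script its stdin/stdout (hypothetical command)
child = pexpect.spawn("python -m some_agent", encoding="utf-8", timeout=60)
child.expect("Continue")   # block until the child prints matching output
child.sendline("y")        # answer the prompt programmatically
child.expect(pexpect.EOF)  # wait for the child to exit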
+category = "main" +optional = false +python-versions = "*" +files = [ + {file = "pexpect-4.8.0-py2.py3-none-any.whl", hash = "sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937"}, + {file = "pexpect-4.8.0.tar.gz", hash = "sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c"}, +] + +[package.dependencies] +ptyprocess = ">=0.5" + [[package]] name = "platformdirs" version = "3.8.0" @@ -761,6 +776,18 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "ptyprocess" +version = "0.7.0" +description = "Run a subprocess in a pseudo terminal" +category = "main" +optional = false +python-versions = "*" +files = [ + {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, + {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"}, +] + [[package]] name = "pycodestyle" version = "2.7.0" @@ -1109,4 +1136,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "81b84bbe08d4a09fb6a4f99c7fb018e0c0fcd879fa368c388b0af20c7c9a3f31" +content-hash = "09871e879785f0a7d5c31a61553cd2df08d88324a864b9c56b8e97d95893157f" diff --git a/pyproject.toml b/pyproject.toml index 1a96a51de..b0526ab57 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ pytest-depends = "^1.0.1" python-dotenv = "^0.21.0" click = "^8.1.3" types-requests = "^2.31.0.1" +pexpect = "^4.8.0" [tool.poetry.group.dev.dependencies] flake8 = "^3.9.2" -- cgit v1.2.3 From 46f31cb643a4803c04f0a1cb5af8bde6afd0a90e Mon Sep 17 00:00:00 2001 From: Luke <2609441+lc0rp@users.noreply.github.com> Date: Tue, 11 Jul 2023 07:40:33 -0400 Subject: Bulletin & version update for 0.4.4 (#4937) Co-authored-by: Reinier van der Leer Co-authored-by: lc0rp <2609411+lc0rp@users.noreply.github.com> --- BULLETIN.md | 43 +++++++++++++++++++++++++------------------ pyproject.toml | 2 +- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/BULLETIN.md b/BULLETIN.md index 0b8afeba4..117a436a8 100644 --- a/BULLETIN.md +++ b/BULLETIN.md @@ -1,22 +1,29 @@ -# Website and Documentation Site 📰📖 -Check out *https://agpt.co*, the official news & updates site for Auto-GPT! -The documentation also has a place here, at *https://docs.agpt.co* +# QUICK LINKS 🔗 +# -------------- +🌎 *Official Website*: https://agpt.co. +📖 *User Guide*: https://docs.agpt.co. +👩 *Contributors Wiki*: https://github.com/Significant-Gravitas/Auto-GPT/wiki/Contributing. -# For contributors 👷🏼 -Since releasing v0.3.0, whave been working on re-architecting the Auto-GPT core to make it more extensible and make room for structural performance-oriented R&D. +# v0.4.4 RELEASE HIGHLIGHTS! 🚀 +# ----------------------------- +## GPT-4 is back! +Following OpenAI's recent GPT-4 GA announcement, the SMART_LLM .env setting +now defaults to GPT-4, and Auto-GPT will use GPT-4 by default in its main loop. -Check out the contribution guide on our wiki: -https://github.com/Significant-Gravitas/Auto-GPT/wiki/Contributing +### !! High Costs Warning !! 💰💀🚨 +GPT-4 costs ~20x more than GPT-3.5-turbo. +Please take note of this before using SMART_LLM. You can use `--gpt3only` +or `--gpt4only` to force the use of GPT-3.5-turbo or GPT-4, respectively, +at runtime. 
-# 🚀 v0.4.3 Release 🚀 -We're happy to announce the 0.4.3 maintenance release, which primarily focuses on refining the LLM command execution, -extending support for OpenAI's latest models (including the powerful GPT-3 16k model), and laying the groundwork -for future compatibility with OpenAI's function calling feature. +## Re-arch v1 preview release! +We've released a preview version of the re-arch code, under `autogpt/core`. +This is a major milestone for us, and we're excited to continue working on it. +We look forward to your feedback. Follow the process here: +https://github.com/Significant-Gravitas/Auto-GPT/issues/4770. -Key Highlights: -- OpenAI API Key Prompt: Auto-GPT will now courteously prompt users for their OpenAI API key, if it's not already provided. -- Summarization Enhancements: We've optimized Auto-GPT's use of the LLM context window even further. -- JSON Memory Reading: Support for reading memories from JSON files has been improved, resulting in enhanced task execution. -- Deprecated commands, removed for a leaner, more performant LLM: analyze_code, write_tests, improve_code, audio_text, web_playwright, web_requests -## Take a look at the Release Notes on Github for the full changelog! -https://github.com/Significant-Gravitas/Auto-GPT/releases +## Other highlights +Other fixes include plugins regressions, Azure config and security patches. + +Take a look at the Release Notes on Github for the full changelog! +https://github.com/Significant-Gravitas/Auto-GPT/releases. diff --git a/pyproject.toml b/pyproject.toml index b0aea625c..06b2f87f8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "agpt" -version = "0.4.3" +version = "0.4.4" authors = [ { name="Torantulino", email="support@agpt.co" }, ] -- cgit v1.2.3 From 22295350a63cad4ab0be2af83e68cc8e106b7201 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Tue, 11 Jul 2023 09:57:53 -0700 Subject: All Agents log to helicone automatically (#85) Signed-off-by: Merwane Hamadi Co-authored-by: Justin --- .github/workflows/ci.yml | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9df4173b2..4d52dd027 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,17 +2,16 @@ name: CI on: workflow_dispatch: - branches: [ master ] + branches: [master] schedule: - - cron: '0 8 * * *' + - cron: "0 8 * * *" push: - branches: [ master, ci-test* ] + branches: [master, ci-test*] pull_request: - branches: [ stable, master, release-* ] + branches: [stable, master, release-*] jobs: lint: - runs-on: ubuntu-latest env: min-python-version: "3.10" @@ -83,10 +82,9 @@ jobs: matrix: agent-name: - "gpt-engineer" + - "smol-developer" - "Auto-GPT" - "mini-agi" - - "smol-developer" - steps: - name: Checkout repository uses: actions/checkout@v3 @@ -151,10 +149,11 @@ jobs: echo "Unknown agent name: $AGENT_NAME" exit 1 fi - + pip install ../../dist/*.whl if [ "${GITHUB_EVENT_NAME}" == "schedule" ] || [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ]; then + curl -s https://raw.githubusercontent.com/Helicone/helicone/main/mitmproxy.sh | bash -s start agbenchmark start --maintain else agbenchmark start --maintain --mock @@ -168,6 +167,8 @@ jobs: AGENT_NAME: ${{ matrix.agent-name }} PROMPT_USER: false # For mini-agi. TODO: Remove this once mini-agi follows the standards. 
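Two of the settings that follow work together: the mitmproxy script routes each agent's OpenAI traffic through Helicone, and REQUESTS_CA_BUNDLE is how Python's requests library is told which certificate bundle to trust for TLS verification, which the intercepting proxy's CA is expected to be part of. A sketch of the effect; the URL and key are placeholders:

import os
import requests

# Point requests at the system bundle the proxy setup script is assumed to amend
os.environ["REQUESTS_CA_BUNDLE"] = "/etc/ssl/certs/ca-certificates.crt"

# With the intercepting proxy active, this call is transparently logged by Helicone
response = requests.get(
    "https://api.openai.com/v1/models",
    headers={"Authorization": "Bearer <OPENAI_API_KEY>"},
)
print(response.status_code)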
HELICONE_API_KEY: ${{ secrets.HELICONE_API_KEY }} + REQUESTS_CA_BUNDLE: /etc/ssl/certs/ca-certificates.crt + HELICONE_CACHE_ENABLED: false - name: Upload logs as artifact if: always() -- cgit v1.2.3 From 4ecb70c5e3e2fbf63780ba983cc1e96eea251541 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Tue, 11 Jul 2023 12:11:24 -0700 Subject: Fix Auto-GPT integration by adding python module as entrypoint (#86) Co-authored-by: Silen Naihin --- agbenchmark/agent_interface.py | 3 +-- agbenchmark/config.json | 2 +- agent/Auto-GPT | 2 +- agent/SuperAGI | 2 +- agent/gpt-engineer | 2 +- agent/mini-agi | 2 +- agent/smol-developer | 2 +- 7 files changed, 7 insertions(+), 8 deletions(-) diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index 713451f01..c737f3079 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -32,8 +32,7 @@ def run_agent( print( f"Running Python function '{config['entry_path']}' with timeout {timeout}" ) - - command = [sys.executable, config["entry_path"], str(task)] + command = [sys.executable, "-m", config["entry_path"], str(task)] process = subprocess.Popen( command, stdout=subprocess.PIPE, diff --git a/agbenchmark/config.json b/agbenchmark/config.json index 9dd8b16ab..af83029ef 100644 --- a/agbenchmark/config.json +++ b/agbenchmark/config.json @@ -1,5 +1,5 @@ { "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "agbenchmark/benchmarks.py", + "entry_path": "agbenchmark.benchmarks", "cutoff": 60 } diff --git a/agent/Auto-GPT b/agent/Auto-GPT index ade8e6f81..e5fbe4313 160000 --- a/agent/Auto-GPT +++ b/agent/Auto-GPT @@ -1 +1 @@ -Subproject commit ade8e6f8142a937160596a987ab96808b583f9e3 +Subproject commit e5fbe4313e0ebf7f75514a181a5d2044a7babd26 diff --git a/agent/SuperAGI b/agent/SuperAGI index a28224d82..928051291 160000 --- a/agent/SuperAGI +++ b/agent/SuperAGI @@ -1 +1 @@ -Subproject commit a28224d82572b598ccee1057086fabaf33e1aaa9 +Subproject commit 9280512910c74bc33333e2ce7c48e47021227529 diff --git a/agent/gpt-engineer b/agent/gpt-engineer index 538bcba6e..42400fd67 160000 --- a/agent/gpt-engineer +++ b/agent/gpt-engineer @@ -1 +1 @@ -Subproject commit 538bcba6efbb7cda7f6a355a8c8420bbbdb52f25 +Subproject commit 42400fd67972278e454621e7abf450a4f899a44a diff --git a/agent/mini-agi b/agent/mini-agi index ad2b34505..6a1d08880 160000 --- a/agent/mini-agi +++ b/agent/mini-agi @@ -1 +1 @@ -Subproject commit ad2b345050e07efb7ad0bde68c93bc2b4e2d7a92 +Subproject commit 6a1d08880c65fe3e5831243c1e1ea19acf85516c diff --git a/agent/smol-developer b/agent/smol-developer index 150981f77..a0e9f4f39 160000 --- a/agent/smol-developer +++ b/agent/smol-developer @@ -1 +1 @@ -Subproject commit 150981f77f19777bf5aa76cb3a74869e4a8a8a05 +Subproject commit a0e9f4f39e26a56b13a364be09fc58d2d85150ea -- cgit v1.2.3 From b3c506cd943f82f65720c116a770d062a37e0982 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Tue, 11 Jul 2023 17:02:29 -0700 Subject: Fix Auto-GPT looping forever (#87) --- agbenchmark/agent_interface.py | 50 +++++++++++++++++++++++++----------------- agent/Auto-GPT | 2 +- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index c737f3079..a1a79ada0 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -42,35 +42,45 @@ def run_agent( ) start_time = time.time() - timeout = config["cutoff"] - while True: - if process.stdout is None: - continue + print( + f"Running Python function '{config['entry_path']}' with 
timeout {config['cutoff']}" + ) + command = [sys.executable, "-m", config["entry_path"], str(task)] + process = subprocess.Popen( + command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True, + ) - while output := process.stdout.readline(): - print(output.strip()) + start_time = time.time() - # Check if process has ended - if process.poll() is not None: - print("The Python function has finished running.") - break + while True: + output = "" + if process.stdout is not None: + output = process.stdout.readline() + print(output.strip()) - # Check if process has exceeded timeout - if time.time() - start_time > timeout: - print( - "The Python function has exceeded the time limit and was terminated." - ) - # Terminate the process group - process.terminate() + # Check if process has ended, has no more output, or exceeded timeout + if ( + process.poll() is not None + or output == "" + or (time.time() - start_time > config["cutoff"]) + ): break - # Optional: sleep for a while - time.sleep(0.1) + if time.time() - start_time > config["cutoff"]: + print("The Python function has exceeded the time limit and was terminated.") + process.kill() + else: + print("The Python function has finished running.") - # Wait for process to terminate, then get return code process.wait() + if process.returncode != 0: + print(f"The agent timed out") + def copy_artifacts_into_workspace( workspace: str, artifact_folder_name: str, challenge_dir_path: str diff --git a/agent/Auto-GPT b/agent/Auto-GPT index e5fbe4313..d4fc134f8 160000 --- a/agent/Auto-GPT +++ b/agent/Auto-GPT @@ -1 +1 @@ -Subproject commit e5fbe4313e0ebf7f75514a181a5d2044a7babd26 +Subproject commit d4fc134f8c4bd7b63f283f932f68932317f53f78 -- cgit v1.2.3 From 504634b4a6d9a1bb327b026694f2bf1692226bee Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Tue, 11 Jul 2023 20:50:56 -0700 Subject: Add custom properties to Helicone (#91) --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4d52dd027..50adac76d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -169,6 +169,7 @@ jobs: HELICONE_API_KEY: ${{ secrets.HELICONE_API_KEY }} REQUESTS_CA_BUNDLE: /etc/ssl/certs/ca-certificates.crt HELICONE_CACHE_ENABLED: false + HELICONE_PROPERTY_AGENT: ${{ matrix.agent-name }} - name: Upload logs as artifact if: always() -- cgit v1.2.3 From e292ffebaff80d9eaeaea6c5c8600a5d53361e5f Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Tue, 11 Jul 2023 21:37:49 -0700 Subject: Enable cache (#92) --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 50adac76d..45bd64fff 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -168,7 +168,7 @@ jobs: PROMPT_USER: false # For mini-agi. TODO: Remove this once mini-agi follows the standards. 
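The looping fix above rests on a property of pipes worth spelling out: readline() on a child's stdout returns an empty string once the pipe reaches EOF, so treating "" as a stop condition ends the loop even before poll() reports an exit code. A compact sketch of that run-with-cutoff pattern; the command is illustrative:

import subprocess
import sys
import time

def run_with_cutoff(command: list[str], cutoff: float) -> int:
    """Stream a child's output until EOF, exit, or the cutoff elapses."""
    process = subprocess.Popen(
        command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
        universal_newlines=True,
    )
    start = time.time()
    while True:
        line = process.stdout.readline()  # "" only when the pipe hits EOF
        if line:
            print(line.strip())
        if process.poll() is not None or line == "" or time.time() - start > cutoff:
            break
    if time.time() - start > cutoff:
        process.kill()  # hard stop for a runaway agent
    process.wait()
    return process.returncode

# e.g. run_with_cutoff([sys.executable, "-m", "agbenchmark.benchmarks", "task"], 60)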
HELICONE_API_KEY: ${{ secrets.HELICONE_API_KEY }} REQUESTS_CA_BUNDLE: /etc/ssl/certs/ca-certificates.crt - HELICONE_CACHE_ENABLED: false + HELICONE_CACHE_ENABLED: true HELICONE_PROPERTY_AGENT: ${{ matrix.agent-name }} - name: Upload logs as artifact -- cgit v1.2.3 From 8d0c5179ed94fcf673403293c4664be4da542333 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Wed, 12 Jul 2023 01:37:59 -0400 Subject: fixing backslashes, adding basic metrics (#89) --- agbenchmark/ReportManager.py | 23 +-- agbenchmark/agent_interface.py | 16 -- agbenchmark/challenges/code/d1/data.json | 2 +- agbenchmark/challenges/code/d2/data.json | 4 +- agbenchmark/challenges/code/d3/data.json | 2 +- agbenchmark/challenges/define_task_types.py | 41 ++++- .../challenges/interface/read_file/data.json | 2 +- agbenchmark/challenges/interface/search/data.json | 2 +- .../challenges/interface/write_file/data.json | 2 +- agbenchmark/challenges/memory/m1/data.json | 6 +- agbenchmark/challenges/memory/m2/data.json | 4 +- agbenchmark/challenges/memory/m3/data.json | 4 +- agbenchmark/challenges/memory/m4/data.json | 4 +- agbenchmark/challenges/retrieval/r1/data.json | 2 +- agbenchmark/challenges/retrieval/r2/data.json | 2 +- agbenchmark/challenges/retrieval/r3/data.json | 2 +- agbenchmark/challenges/test_all.py | 15 +- agbenchmark/conftest.py | 83 ++++++++- agbenchmark/internal_info.json | 67 ++++++++ agbenchmark/regression_tests.json | 75 +++----- agbenchmark/reports/1.json | 191 +++++++++++++-------- agbenchmark/utils.py | 55 ++++++ agent/SuperAGI | 2 +- agent/gpt-engineer | 2 +- agent/mini-agi | 2 +- agent/smol-developer | 2 +- 26 files changed, 412 insertions(+), 200 deletions(-) create mode 100644 agbenchmark/internal_info.json diff --git a/agbenchmark/ReportManager.py b/agbenchmark/ReportManager.py index e6d8f62f6..cae13595a 100644 --- a/agbenchmark/ReportManager.py +++ b/agbenchmark/ReportManager.py @@ -3,7 +3,9 @@ import os import sys import time from datetime import datetime -from typing import Any, Dict, Union +from typing import Any, Dict + +from agbenchmark.utils import get_highest_success_difficulty class ReportManager: @@ -23,7 +25,6 @@ class ReportManager: if file_content: # if file is not empty, load the json data = json.loads(file_content) self.tests = {k: data[k] for k in sorted(data)} - data = self.replace_backslash(data) else: # if file is empty, assign an empty dictionary self.tests = {} except FileNotFoundError: @@ -36,8 +37,9 @@ class ReportManager: with open(self.filename, "w") as f: json.dump(self.tests, f, indent=4) - def add_test(self, test_name: str, test_details: dict) -> None: + def add_test(self, test_name: str, test_details: dict | list) -> None: self.tests[test_name] = test_details + self.save() def remove_test(self, test_name: str) -> None: @@ -50,19 +52,12 @@ class ReportManager: self.tests = { "command": command.split(os.sep)[-1], "completion_time": datetime.now().strftime("%Y-%m-%d-%H:%M"), - "time_elapsed": str(round(time.time() - self.start_time, 2)) + " seconds", + "metrics": { + "run_time": str(round(time.time() - self.start_time, 2)) + " seconds", + "highest_difficulty": get_highest_success_difficulty(self.tests), + }, "tests": self.tests, "config": config, } self.save() - - def replace_backslash(self, value: str) -> Union[str, list[str], dict]: - if isinstance(value, str): - return value.replace("\\\\", "/") # escape \ with \\ - elif isinstance(value, list): - return [self.replace_backslash(i) for i in value] - elif isinstance(value, dict): - return {k: self.replace_backslash(v) for k, v in 
value.items()} - else: - return value diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index a1a79ada0..991a7e8e0 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -23,26 +23,10 @@ def run_agent( """Calling to get a response""" if MOCK_FLAG: - print("ITS A MOCK TEST", challenge_location) copy_artifacts_into_workspace( config["workspace"], "artifacts_out", challenge_location ) else: - timeout = config["cutoff"] - print( - f"Running Python function '{config['entry_path']}' with timeout {timeout}" - ) - command = [sys.executable, "-m", config["entry_path"], str(task)] - process = subprocess.Popen( - command, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - universal_newlines=True, - cwd=os.getcwd(), - ) - - start_time = time.time() - print( f"Running Python function '{config['entry_path']}' with timeout {config['cutoff']}" ) diff --git a/agbenchmark/challenges/code/d1/data.json b/agbenchmark/challenges/code/d1/data.json index 0c7246000..061c924f5 100644 --- a/agbenchmark/challenges/code/d1/data.json +++ b/agbenchmark/challenges/code/d1/data.json @@ -13,6 +13,6 @@ "info": { "difficulty": "basic", "description": "Tests ability for the agent to debug python code with a simple typo in it.", - "side_effects": ["tests if there is in fact an LLM attached"] + "side_effects": [] } } diff --git a/agbenchmark/challenges/code/d2/data.json b/agbenchmark/challenges/code/d2/data.json index 292301094..6523ef1d8 100644 --- a/agbenchmark/challenges/code/d2/data.json +++ b/agbenchmark/challenges/code/d2/data.json @@ -11,8 +11,8 @@ "type": "execute_python_code" }, "info": { - "difficulty": "medium", + "difficulty": "novice", "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", - "side_effects": ["tests if there is in fact an LLM attached"] + "side_effects": [] } } diff --git a/agbenchmark/challenges/code/d3/data.json b/agbenchmark/challenges/code/d3/data.json index 07d607f5f..94c81664c 100644 --- a/agbenchmark/challenges/code/d3/data.json +++ b/agbenchmark/challenges/code/d3/data.json @@ -11,7 +11,7 @@ "type": "custom_python" }, "info": { - "difficulty": "medium", + "difficulty": "advanced", "description": "Tests ability for the agent to build a simple web server locally", "side_effects": [] } diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index f4e3f2220..668025dd2 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -1,15 +1,52 @@ import json +from enum import Enum from pathlib import Path from typing import List, Optional -from pydantic import BaseModel +from pydantic import BaseModel, validator + + +class DifficultyLevel(Enum): + interface = "interface" + basic = "basic" + novice = "novice" + intermediate = "intermediate" + advanced = "advanced" + expert = "expert" + human = "human" + + +# map from enum to difficulty level (numeric) +DIFFICULTY_MAP = { + DifficultyLevel.interface: 1, + DifficultyLevel.basic: 2, + DifficultyLevel.novice: 3, + DifficultyLevel.intermediate: 4, + DifficultyLevel.advanced: 5, + DifficultyLevel.expert: 6, + DifficultyLevel.human: 7, +} class Info(BaseModel): - difficulty: str + difficulty: DifficultyLevel description: str side_effects: List[str] + @validator("difficulty", pre=True) + def difficulty_to_enum(cls: "Info", v: str | DifficultyLevel) -> DifficultyLevel: + """Convert a string to an instance of 
DifficultyLevel.""" + if isinstance(v, DifficultyLevel): + return v + + if isinstance(v, str): + try: + return DifficultyLevel(v.lower()) + except ValueError: + pass + + raise ValueError(f"Cannot convert {v} to DifficultyLevel.") + class Ground(BaseModel): answer: str diff --git a/agbenchmark/challenges/interface/read_file/data.json b/agbenchmark/challenges/interface/read_file/data.json index c827581b6..1bf340454 100644 --- a/agbenchmark/challenges/interface/read_file/data.json +++ b/agbenchmark/challenges/interface/read_file/data.json @@ -11,7 +11,7 @@ }, "info": { "description": "This reads the file quickly", - "difficulty": "basic", + "difficulty": "interface", "side_effects": [""] } } diff --git a/agbenchmark/challenges/interface/search/data.json b/agbenchmark/challenges/interface/search/data.json index f59b2dc9b..de8934d95 100644 --- a/agbenchmark/challenges/interface/search/data.json +++ b/agbenchmark/challenges/interface/search/data.json @@ -11,7 +11,7 @@ "type": "file" }, "info": { - "difficulty": "basic", + "difficulty": "interface", "description": "Tests if an llm can search", "side_effects": [""] } diff --git a/agbenchmark/challenges/interface/write_file/data.json b/agbenchmark/challenges/interface/write_file/data.json index 2be2d0dfe..8db9cd620 100644 --- a/agbenchmark/challenges/interface/write_file/data.json +++ b/agbenchmark/challenges/interface/write_file/data.json @@ -11,7 +11,7 @@ "type": "file" }, "info": { - "difficulty": "basic", + "difficulty": "interface", "description": "Tests the writing to file", "side_effects": ["tests if there is in fact an LLM attached"] } diff --git a/agbenchmark/challenges/memory/m1/data.json b/agbenchmark/challenges/memory/m1/data.json index 506b246ad..ab86f1c3c 100644 --- a/agbenchmark/challenges/memory/m1/data.json +++ b/agbenchmark/challenges/memory/m1/data.json @@ -10,13 +10,9 @@ "files": [".txt"], "type": "file" }, - "mock": { - "mock_func": "basic_memory_mock", - "mock_task": "Follow the instructions in the instructions_1.txt file" - }, "info": { "difficulty": "basic", "description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files", - "side_effects": ["tests if there is in fact an LLM attached"] + "side_effects": [] } } diff --git a/agbenchmark/challenges/memory/m2/data.json b/agbenchmark/challenges/memory/m2/data.json index 7ef2552d1..9205c99f1 100644 --- a/agbenchmark/challenges/memory/m2/data.json +++ b/agbenchmark/challenges/memory/m2/data.json @@ -11,8 +11,8 @@ "type": "file" }, "info": { - "difficulty": "basic", + "difficulty": "novice", "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", - "side_effects": ["tests if there is in fact an LLM attached"] + "side_effects": [] } } diff --git a/agbenchmark/challenges/memory/m3/data.json b/agbenchmark/challenges/memory/m3/data.json index 720cce93c..3b78d614b 100644 --- a/agbenchmark/challenges/memory/m3/data.json +++ b/agbenchmark/challenges/memory/m3/data.json @@ -11,8 +11,8 @@ "type": "file" }, "info": { - "difficulty": "medium", + "difficulty": "intermediate", "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", - "side_effects": ["tests if there is in fact an LLM attached"] + "side_effects": [] } } diff --git a/agbenchmark/challenges/memory/m4/data.json b/agbenchmark/challenges/memory/m4/data.json index 61965206b..84f5c2b21 100644 --- a/agbenchmark/challenges/memory/m4/data.json +++ b/agbenchmark/challenges/memory/m4/data.json @@ -16,8 +16,8 @@ "type": "file" }, "info": { - "difficulty": "medium", + "difficulty": "advanced", "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", - "side_effects": ["tests if there is in fact an LLM attached"] + "side_effects": [] } } diff --git a/agbenchmark/challenges/retrieval/r1/data.json b/agbenchmark/challenges/retrieval/r1/data.json index 4f3833dfc..e3e09302d 100644 --- a/agbenchmark/challenges/retrieval/r1/data.json +++ b/agbenchmark/challenges/retrieval/r1/data.json @@ -13,6 +13,6 @@ "info": { "difficulty": "basic", "description": "Tests ability to retrieve information from a website.", - "side_effects": ["tests if there is in fact an LLM attached"] + "side_effects": [] } } diff --git a/agbenchmark/challenges/retrieval/r2/data.json b/agbenchmark/challenges/retrieval/r2/data.json index 5bc2e96b4..977be4bcd 100644 --- a/agbenchmark/challenges/retrieval/r2/data.json +++ b/agbenchmark/challenges/retrieval/r2/data.json @@ -11,7 +11,7 @@ "type": "file" }, "info": { - "difficulty": "basic", + "difficulty": "novice", "description": "Tests ability to retrieve information.", "side_effects": ["tests if there is in fact an LLM attached"] } diff --git a/agbenchmark/challenges/retrieval/r3/data.json b/agbenchmark/challenges/retrieval/r3/data.json index b918d3d4e..5504908ea 100644 --- a/agbenchmark/challenges/retrieval/r3/data.json +++ b/agbenchmark/challenges/retrieval/r3/data.json @@ -27,7 +27,7 @@ "type": "file" }, "info": { - "difficulty": "basic", + "difficulty": "intermediate", "description": "Tests ability to retrieve information.", "side_effects": ["tests if there is in fact an LLM attached"] } diff --git a/agbenchmark/challenges/test_all.py b/agbenchmark/challenges/test_all.py index 00a6ed635..a5afef96c 100644 --- a/agbenchmark/challenges/test_all.py +++ b/agbenchmark/challenges/test_all.py @@ -9,15 +9,10 @@ from pathlib import Path from typing import Any, Dict import pytest -from dotenv import load_dotenv from agbenchmark.challenge import Challenge from agbenchmark.start_benchmark import CURRENT_DIRECTORY - -load_dotenv() - -IMPROVE = os.getenv("IMPROVE", "False") - +from agbenchmark.utils import replace_backslash json_files = glob.glob(f"{CURRENT_DIRECTORY}/**/data.json", recursive=True) @@ -36,7 +31,11 @@ def get_test_path(json_file: str) -> str: # Create the path from "agbenchmark" onwards challenge_location = Path(*path.parts[agbenchmark_index:]) - return str(challenge_location) + formatted_location = replace_backslash(str(challenge_location)) + if isinstance(formatted_location, str): + return formatted_location + else: + return str(challenge_location) def generate_tests() -> None: @@ -68,7 +67,7 @@ def generate_tests() -> None: ) sys.path.append(str(custom_python_location)) - for (module_loader, name, ispkg) in pkgutil.iter_modules( + for module_loader, name, ispkg in pkgutil.iter_modules( [str(custom_python_location)] 
): module = importlib.import_module(name) diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 87fdc9c10..b91b5f9f8 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -1,6 +1,8 @@ import json import os import shutil +import sys +import time from pathlib import Path # noqa from typing import Any, Dict, Generator @@ -13,6 +15,7 @@ from agbenchmark.start_benchmark import ( REGRESSION_TESTS_PATH, get_regression_data, ) +from agbenchmark.utils import calculate_success_percentage def resolve_workspace(config: Dict[str, Any]) -> str: @@ -107,9 +110,29 @@ def challenge_data(request: Any) -> None: return request.param +@pytest.fixture(autouse=True, scope="session") +def mock(request: Any) -> None: + return request.config.getoption("--mock") + + +@pytest.fixture(autouse=True, scope="function") +def timer(request: Any) -> Any: + start_time = time.time() + yield + run_time = time.time() - start_time + request.node.user_properties.append(("run_time", run_time)) + + +# tests that consistently pass are considered regression tests regression_manager = ReportManager(REGRESSION_TESTS_PATH) + +# user facing reporting information info_manager = ReportManager(INFO_TESTS_PATH) +INTERNAL_LOGS = Path(__file__).resolve().parent # agbenchmark/conftest.py +# internal db step in replacement track pass/fail rate +internal_info = ReportManager(str(INTERNAL_LOGS / "internal_info.json")) + def pytest_runtest_makereport(item: Any, call: Any) -> None: if call.when == "call": @@ -122,23 +145,66 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None: ) # Extract the challenge_location from the class challenge_location: str = getattr(item.cls, "CHALLENGE_LOCATION", "") + test_name = item.nodeid.split("::")[1] + item.test_name = test_name test_details = { "difficulty": difficulty, "dependencies": dependencies, - "test": challenge_location, + "data_path": challenge_location, + } + + info_details: Any = { + "data_path": challenge_location, + "is_regression": False, + "metrics": { + "difficulty": difficulty, + "success": False, + }, } - print("pytest_runtest_makereport", test_details) + mock = "--mock" in sys.argv # Check if --mock is in sys.argv + if call.excinfo is None: - regression_manager.add_test(item.nodeid.split("::")[1], test_details) - test_details["success"] = True + info_details["metrics"]["success"] = True else: - regression_manager.remove_test(item.nodeid.split("::")[1]) - test_details["success"] = False - test_details["fail_reason"] = str(call.excinfo.value) + if not mock: # don't remove if it's a mock test + regression_manager.remove_test(test_name) + info_details["metrics"]["fail_reason"] = str(call.excinfo.value) + + prev_test_results: list[bool] = [] + + if not mock: + # only add if it's an actual test + prev_test_results = internal_info.tests.get(test_name, []) + prev_test_results.append(info_details["metrics"]["success"]) + internal_info.add_test(test_name, prev_test_results) + + # can calculate success rate regardless of mock + info_details["metrics"]["success_%"] = calculate_success_percentage( + prev_test_results + ) + + if len(prev_test_results) >= 3 and prev_test_results[-3:] == [True, True, True]: + # if the last 3 tests were successful, add to the regression tests + info_details["is_regression"] = True + regression_manager.add_test(test_name, test_details) + + # user facing reporting + item.info_details = info_details + if call.when == "teardown": + run_time = dict(item.user_properties).get("run_time") + + info_details = getattr(item, "info_details", {}) + 
test_name = getattr(item, "test_name", "") + + if info_details and test_name: + if run_time: + info_details["metrics"][ + "run_time" + ] = f"{str(round(run_time, 3))} seconds" - info_manager.add_test(item.nodeid.split("::")[1], test_details) + info_manager.add_test(test_name, info_details) def pytest_sessionfinish(session: Any) -> None: @@ -146,6 +212,7 @@ def pytest_sessionfinish(session: Any) -> None: with open(CONFIG_PATH, "r") as f: config = json.load(f) + internal_info.save() info_manager.end_info_report(config) regression_manager.save() diff --git a/agbenchmark/internal_info.json b/agbenchmark/internal_info.json new file mode 100644 index 000000000..5f46bd854 --- /dev/null +++ b/agbenchmark/internal_info.json @@ -0,0 +1,67 @@ +{ + "TestBasicMemory": [ + true, + true, + true + ], + "TestBasicRetrieval": [ + true, + true, + true + ], + "TestCreateSimpleWebServer": [ + false, + false, + false + ], + "TestDebugSimpleTypoWithGuidance": [ + false, + false, + false + ], + "TestDebugSimpleTypoWithoutGuidance": [ + false, + false, + false + ], + "TestReadFile": [ + true, + true, + true + ], + "TestRememberMultipleIds": [ + true, + true, + true + ], + "TestRememberMultipleIdsWithNoise": [ + true, + true, + true + ], + "TestRememberMultiplePhrasesWithNoise": [ + true, + true, + true + ], + "TestRetrieval2": [ + true, + true, + true + ], + "TestRetrieval3": [ + true, + true, + true + ], + "TestSearch": [ + true, + true, + true + ], + "TestWriteFile": [ + true, + true, + true + ] +} \ No newline at end of file diff --git a/agbenchmark/regression_tests.json b/agbenchmark/regression_tests.json index 68632a127..ce73ce263 100644 --- a/agbenchmark/regression_tests.json +++ b/agbenchmark/regression_tests.json @@ -1,20 +1,11 @@ { - "TestReadFile": { - "difficulty": "basic", - "dependencies": [ - "TestWriteFile" - ], - "test": "agbenchmark/challenges/interface/read_file", - "success": true - }, "TestBasicMemory": { "difficulty": "basic", "dependencies": [ "TestReadFile", "TestWriteFile" ], - "test": "agbenchmark/challenges/memory/m1", - "success": true + "data_path": "agbenchmark/challenges/memory/m1" }, "TestBasicRetrieval": { "difficulty": "basic", @@ -22,78 +13,60 @@ "TestWriteFile", "TestSearch" ], - "test": "agbenchmark/challenges/retrieval/r1", - "success": true + "data_path": "agbenchmark/challenges/retrieval/r1" }, - "TestRememberMultipleIds": { + "TestReadFile": { "difficulty": "basic", "dependencies": [ - "TestBasicMemory" + "TestWriteFile" ], - "test": "agbenchmark/challenges/memory/m2", - "success": true + "data_path": "agbenchmark/challenges/interface/read_file" }, - "TestRetrieval2": { + "TestRememberMultipleIds": { "difficulty": "basic", "dependencies": [ - "TestBasicRetrieval" + "TestBasicMemory" ], - "test": "agbenchmark/challenges/retrieval/r2", - "success": true + "data_path": "agbenchmark/challenges/memory/m2" }, "TestRememberMultipleIdsWithNoise": { "difficulty": "medium", "dependencies": [ "TestRememberMultipleIds" ], - "test": "agbenchmark/challenges/memory/m3", - "success": true - }, - "TestRetrieval3": { - "difficulty": "basic", - "dependencies": [ - "TestRetrieval2" - ], - "test": "agbenchmark/challenges/retrieval/r3", - "success": true + "data_path": "agbenchmark/challenges/memory/m3" }, "TestRememberMultiplePhrasesWithNoise": { "difficulty": "medium", "dependencies": [ "TestRememberMultipleIdsWithNoise" ], - "test": "agbenchmark/challenges/memory/m4", - "success": true + "data_path": "agbenchmark/challenges/memory/m4" }, - "TestSearch": { + "TestRetrieval2": { "difficulty": 
"basic", "dependencies": [ - "TestWriteFile" + "TestBasicRetrieval" ], - "test": "agbenchmark/challenges/interface/search", - "success": true + "data_path": "agbenchmark/challenges/retrieval/r2" }, - "TestWriteFile": { + "TestRetrieval3": { "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/interface/write_file", - "success": true + "dependencies": [ + "TestRetrieval2" + ], + "data_path": "agbenchmark/challenges/retrieval/r3" }, - "TestDebugSimpleTypoWithGuidance": { + "TestSearch": { "difficulty": "basic", "dependencies": [ - "TestReadFile", "TestWriteFile" ], - "test": "agbenchmark/challenges/code/d1", - "success": true + "data_path": "agbenchmark/challenges/interface/search" }, - "TestDebugSimpleTypoWithoutGuidance": { - "difficulty": "medium", - "dependencies": [ - "TestDebugSimpleTypoWithGuidance" - ], - "test": "agbenchmark/challenges/code/d2", - "success": true + "TestWriteFile": { + "difficulty": "basic", + "dependencies": [], + "data_path": "agbenchmark/challenges/interface/write_file" } } \ No newline at end of file diff --git a/agbenchmark/reports/1.json b/agbenchmark/reports/1.json index df07fb878..45945a3ee 100644 --- a/agbenchmark/reports/1.json +++ b/agbenchmark/reports/1.json @@ -1,109 +1,148 @@ { "command": "agbenchmark start --mock", - "completion_time": "2023-07-10-21:19", - "time_elapsed": "8.75 seconds", + "completion_time": "2023-07-11-21:09", + "metrics": { + "run_time": "0.96 seconds", + "highest_difficulty": "advanced: 5" + }, "tests": { "TestWriteFile": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/interface/write_file", - "success": true + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 0, + "run_time": "0.008 seconds" + } }, "TestReadFile": { - "difficulty": "basic", - "dependencies": [ - "TestWriteFile" - ], - "test": "agbenchmark/challenges/interface/read_file", - "success": true + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 0, + "run_time": "0.005 seconds" + } }, "TestSearch": { - "difficulty": "basic", - "dependencies": [ - "TestWriteFile" - ], - "test": "agbenchmark/challenges/interface/search", - "success": true + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 0, + "run_time": "0.006 seconds" + } }, "TestDebugSimpleTypoWithGuidance": { - "difficulty": "basic", - "dependencies": [ - "TestReadFile", - "TestWriteFile" - ], - "test": "agbenchmark/challenges/code/d1", - "success": true + "data_path": "agbenchmark/challenges/code/d1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0, + "run_time": "0.489 seconds" + } }, "TestBasicMemory": { - "difficulty": "basic", - "dependencies": [ - "TestReadFile", - "TestWriteFile" - ], - "test": "agbenchmark/challenges/memory/m1", - "success": true + "data_path": "agbenchmark/challenges/memory/m1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": true, + "success_%": 0, + "run_time": "0.02 seconds" + } }, "TestBasicRetrieval": { - "difficulty": "basic", - "dependencies": [ - "TestWriteFile", - "TestSearch" - ], - "test": "agbenchmark/challenges/retrieval/r1", - "success": true + "data_path": 
"agbenchmark/challenges/retrieval/r1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": true, + "success_%": 0, + "run_time": "0.01 seconds" + } }, "TestDebugSimpleTypoWithoutGuidance": { - "difficulty": "medium", - "dependencies": [ - "TestDebugSimpleTypoWithGuidance" - ], - "test": "agbenchmark/challenges/code/d2", - "success": true + "data_path": "agbenchmark/challenges/code/d2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0, + "run_time": "0.001 seconds" + } + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d3", + "is_regression": false, + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0, + "run_time": "0.001 seconds" + } }, "TestRememberMultipleIds": { - "difficulty": "basic", - "dependencies": [ - "TestBasicMemory" - ], - "test": "agbenchmark/challenges/memory/m2", - "success": true + "data_path": "agbenchmark/challenges/memory/m2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": true, + "success_%": 0, + "run_time": "0.018 seconds" + } }, "TestRetrieval2": { - "difficulty": "basic", - "dependencies": [ - "TestBasicRetrieval" - ], - "test": "agbenchmark/challenges/retrieval/r2", - "success": true + "data_path": "agbenchmark/challenges/retrieval/r2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": true, + "success_%": 0, + "run_time": "0.009 seconds" + } }, "TestRememberMultipleIdsWithNoise": { - "difficulty": "medium", - "dependencies": [ - "TestRememberMultipleIds" - ], - "test": "agbenchmark/challenges/memory/m3", - "success": true + "data_path": "agbenchmark/challenges/memory/m3", + "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": true, + "success_%": 0, + "run_time": "0.022 seconds" + } }, "TestRetrieval3": { - "difficulty": "basic", - "dependencies": [ - "TestRetrieval2" - ], - "test": "agbenchmark/challenges/retrieval/r3", - "success": true + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": true, + "success_%": 0, + "run_time": "0.01 seconds" + } }, "TestRememberMultiplePhrasesWithNoise": { - "difficulty": "medium", - "dependencies": [ - "TestRememberMultipleIdsWithNoise" - ], - "test": "agbenchmark/challenges/memory/m4", - "success": true + "data_path": "agbenchmark/challenges/memory/m4", + "is_regression": false, + "metrics": { + "difficulty": "advanced", + "success": true, + "success_%": 0, + "run_time": "0.021 seconds" + } } }, "config": { "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "agbenchmark/benchmarks.py", + "entry_path": "agbenchmark.benchmarks", "cutoff": 60 } } \ No newline at end of file diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py index ffde0c6d3..598113d3d 100644 --- a/agbenchmark/utils.py +++ b/agbenchmark/utils.py @@ -1,6 +1,10 @@ # radio charts, logs, helper functions for tests, anything else relevant. 
import glob +import re from pathlib import Path +from typing import Any + +from agbenchmark.challenges.define_task_types import DIFFICULTY_MAP, DifficultyLevel def calculate_info_test_path(benchmarks_folder_path: Path) -> str: @@ -15,3 +19,54 @@ def calculate_info_test_path(benchmarks_folder_path: Path) -> str: run_name = f"{file_count + 1}.json" new_file_path = INFO_TESTS_PATH / run_name return str(new_file_path) + + +def replace_backslash(value: Any) -> Any: + if isinstance(value, str): + return re.sub( + r"\\+", "/", value + ) # replace one or more backslashes with a forward slash + elif isinstance(value, list): + return [replace_backslash(i) for i in value] + elif isinstance(value, dict): + return {k: replace_backslash(v) for k, v in value.items()} + else: + return value + + +def calculate_success_percentage(results: list[bool]) -> float: + success_count = results.count(True) + total_count = len(results) + if total_count == 0: + return 0 + success_percentage = (success_count / total_count) * 100 # as a percentage + return round(success_percentage, 2) + + +def get_highest_success_difficulty(data: dict) -> str: + highest_difficulty = None + highest_difficulty_level = -1 + + for test_name, test_data in data.items(): + if test_data["metrics"]["success"]: + # Replace 'medium' with 'intermediate' for this example + difficulty_str = test_data["metrics"]["difficulty"] + + try: + difficulty_enum = DifficultyLevel[difficulty_str.lower()] + difficulty_level = DIFFICULTY_MAP[difficulty_enum] + + if difficulty_level > highest_difficulty_level: + highest_difficulty = difficulty_enum + highest_difficulty_level = difficulty_level + except KeyError: + print( + f"Unexpected difficulty level '{difficulty_str}' in test '{test_name}'" + ) + + if highest_difficulty is not None: + highest_difficulty_str = highest_difficulty.name # convert enum to string + else: + highest_difficulty_str = "" + + return f"{highest_difficulty_str}: {highest_difficulty_level}" diff --git a/agent/SuperAGI b/agent/SuperAGI index 928051291..bd4b3def6 160000 --- a/agent/SuperAGI +++ b/agent/SuperAGI @@ -1 +1 @@ -Subproject commit 9280512910c74bc33333e2ce7c48e47021227529 +Subproject commit bd4b3def65e964182b05bb9f7a350b00f55a6007 diff --git a/agent/gpt-engineer b/agent/gpt-engineer index 42400fd67..cde9be3e7 160000 --- a/agent/gpt-engineer +++ b/agent/gpt-engineer @@ -1 +1 @@ -Subproject commit 42400fd67972278e454621e7abf450a4f899a44a +Subproject commit cde9be3e73212b3d8366a4ed149a18122bfe2333 diff --git a/agent/mini-agi b/agent/mini-agi index 6a1d08880..08764876d 160000 --- a/agent/mini-agi +++ b/agent/mini-agi @@ -1 +1 @@ -Subproject commit 6a1d08880c65fe3e5831243c1e1ea19acf85516c +Subproject commit 08764876d9a5c84c9f9e879088854d2b9349d7a0 diff --git a/agent/smol-developer b/agent/smol-developer index a0e9f4f39..c52b14b1d 160000 --- a/agent/smol-developer +++ b/agent/smol-developer @@ -1 +1 @@ -Subproject commit a0e9f4f39e26a56b13a364be09fc58d2d85150ea +Subproject commit c52b14b1d5b1b74d886f08d9914e7f43437f609d -- cgit v1.2.3 From b00570f6d9e5ddce1812c0014e7593ea15033736 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Wed, 12 Jul 2023 15:04:24 +0200 Subject: Fix CI warnings --- .github/workflows/benchmarks.yml | 4 ++-- .github/workflows/ci.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index e40abf2f6..195ebeffd 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -27,8 +27,8 @@ jobs: with: 
ref: master - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + - name: Set up Python ${{ matrix.config.python-version }} + uses: actions/setup-python@v4 with: python-version: ${{ matrix.config.python-version }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 79ed16e87..a3d982137 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -33,7 +33,7 @@ jobs: repository: ${{ github.event.pull_request.head.repo.full_name }} - name: Set up Python ${{ env.min-python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ env.min-python-version }} @@ -132,7 +132,7 @@ jobs: fi - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} -- cgit v1.2.3 From 21c0cdcb76f4da12ef413bfc59d171f4f49260d8 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Wed, 12 Jul 2023 17:50:18 +0200 Subject: Disable proxy for internal pull requests (#4953) --- .github/workflows/ci.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a3d982137..109d2d5c1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -159,9 +159,10 @@ jobs: python tests/challenges/utils/build_current_score.py env: CI: true - PROXY: ${{ secrets.PROXY }} - AGENT_MODE: ${{ secrets.AGENT_MODE }} - AGENT_TYPE: ${{ secrets.AGENT_TYPE }} + PROXY: ${{ github.event_name == 'pull_request_target' && secrets.PROXY || '' }} + AGENT_MODE: ${{ github.event_name == 'pull_request_target' && secrets.AGENT_MODE || '' }} + AGENT_TYPE: ${{ github.event_name == 'pull_request_target' && secrets.AGENT_TYPE || '' }} + OPENAI_API_KEY: ${{ github.event_name == 'pull_request' && secrets.OPENAI_API_KEY || '' }} PLAIN_OUTPUT: True - name: Upload coverage reports to Codecov -- cgit v1.2.3 From 3582ada3df619a454f2c12c095f24d088c9b3441 Mon Sep 17 00:00:00 2001 From: James Collins Date: Wed, 12 Jul 2023 10:21:20 -0700 Subject: Add links to github issues in the README and clarify run instructions (#4954) --- autogpt/core/README.md | 54 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 14 deletions(-) diff --git a/autogpt/core/README.md b/autogpt/core/README.md index f7bdf2d7d..c27fa28fe 100644 --- a/autogpt/core/README.md +++ b/autogpt/core/README.md @@ -1,7 +1,27 @@ -# Run instructions +# Auto-GPT Core + +This subpackage contains the ongoing work for the +[Auto-GPT Re-arch](https://github.com/Significant-Gravitas/Auto-GPT/issues/4770). It is +a work in progress and is not yet feature complete. In particular, it does not yet +have many of the Auto-GPT commands implemented and is pending ongoing work to +[re-incorporate vector-based memory and knowledge retrieval](https://github.com/Significant-Gravitas/Auto-GPT/issues/3536). + + +## Running the Re-arch Code There are two client applications for Auto-GPT included. +Unlike the main version of Auto-GPT, the re-arch requires you to actually install Auto-GPT in your python +environment to run this application. To do so, run + +``` +pip install -e REPOSITORY_ROOT +``` + +where `REPOSITORY_ROOT` is the root of the Auto-GPT repository on your machine. The `REPOSITORY_ROOT` +is the directory that contains the `setup.py` file and is the main, top-level directory of the repository +when you clone it. 
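For example, if the repository was cloned to `~/Auto-GPT` (an illustrative path, not a requirement), the install step becomes:

```
git clone https://github.com/Significant-Gravitas/Auto-GPT.git ~/Auto-GPT
pip install -e ~/Auto-GPT
```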
+
## CLI Application

:star2: **This is the reference application I'm working with for now** :star2:

@@ -11,21 +31,23 @@ The first app is a straight CLI application. I have not done anything yet to po
- [Entry Point](https://github.com/Significant-Gravitas/Auto-GPT/blob/master/autogpt/core/runner/cli_app/cli.py)
- [Client Application](https://github.com/Significant-Gravitas/Auto-GPT/blob/master/autogpt/core/runner/cli_app/main.py)

-Auto-GPT must be installed in your python environment to run this application. To do so, run
-
-```
-pip install -e REPOSITORY_ROOT
-```
-
-where `REPOSITORY_ROOT` is the root of the Auto-GPT repository on your machine.
-
You'll then need a settings file. Run

```
python REPOSITORY_ROOT/autogpt/core/runner/cli_app/cli.py make-settings
```

-This will write a file called `default_agent_settings.yaml` with all the user-modifiable configuration keys to `~/auto-gpt/default_agent_settings.yml` and make the `auto-gpt` directory in your user directory if it doesn't exist). At a bare minimum, you'll need to set `openai.credentials.api_key` to your OpenAI API Key to run the model.
+This will write a file called `default_agent_settings.yaml` with all the user-modifiable
+configuration keys to `~/auto-gpt/default_agent_settings.yml` (and make the `auto-gpt` directory
+in your user directory if it doesn't exist). Your user directory is located in different places
+depending on your operating system:
+
+- On Linux, it's `/home/USERNAME`
+- On Windows, it's `C:\Users\USERNAME`
+- On Mac, it's `/Users/USERNAME`
+
+At a bare minimum, you'll need to set `openai.credentials.api_key` to your OpenAI API Key to run
+the model.

You can then run Auto-GPT with

@@ -35,9 +57,15 @@ python REPOSITORY_ROOT/autogpt/core/runner/cli_app/cli.py run

to launch the interaction loop.

-## CLI Web App
+### CLI Web App
+
+:warning: I am not actively developing this application. I am primarily working with the traditional CLI app
+described above. It is a very good place to get involved if you have web application design experience and are
+looking to get involved in the re-arch.
\ No newline at end of file -- cgit v1.2.3 From e0b16cf4ac9a6edb83cdc67ed7d1d8161f3a8956 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Wed, 12 Jul 2023 10:54:50 -0700 Subject: Fix Smol developer and gpt engineer (#93) Signed-off-by: Merwane Hamadi --- agent/gpt-engineer | 2 +- agent/smol-developer | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/agent/gpt-engineer b/agent/gpt-engineer index cde9be3e7..521d626c0 160000 --- a/agent/gpt-engineer +++ b/agent/gpt-engineer @@ -1 +1 @@ -Subproject commit cde9be3e73212b3d8366a4ed149a18122bfe2333 +Subproject commit 521d626c0075ed6545f01b771757c856f8addbd6 diff --git a/agent/smol-developer b/agent/smol-developer index c52b14b1d..aa8233925 160000 --- a/agent/smol-developer +++ b/agent/smol-developer @@ -1 +1 @@ -Subproject commit c52b14b1d5b1b74d886f08d9914e7f43437f609d +Subproject commit aa8233925090c0c9314ceef68397ab37baf17766 -- cgit v1.2.3 From 48ac1c91cd85960d62928d9ab9bb66a8172e8f84 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Wed, 12 Jul 2023 14:30:06 -0700 Subject: Remove dependencies cache (#94) Signed-off-by: Merwane Hamadi --- .github/workflows/ci.yml | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 45bd64fff..34eedb292 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -38,14 +38,6 @@ jobs: run: | curl -sSL https://install.python-poetry.org | python - - - name: Set up Poetry cache - uses: actions/cache@v2 - with: - path: | - ~/.cache/pypoetry - .venv - key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} - - name: Install dependencies run: | poetry install @@ -107,14 +99,6 @@ jobs: run: | curl -sSL https://install.python-poetry.org | python - - - name: Set up Poetry cache - uses: actions/cache@v2 - with: - path: | - ~/.cache/pypoetry - .venv - key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} - - name: Install dependencies run: | poetry install -- cgit v1.2.3 From 78df4915cf41e6fed0a8dc783102728e72825253 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Wed, 12 Jul 2023 14:35:12 -0700 Subject: Remove dependencies if a specific test is asked by the user (#95) Signed-off-by: Merwane Hamadi --- agbenchmark/conftest.py | 3 +++ agbenchmark/start_benchmark.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index b91b5f9f8..32151b8ad 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -85,6 +85,7 @@ def pytest_addoption(parser: Any) -> None: parser.addoption("--mock", action="store_true", default=False) parser.addoption("--improve", action="store_true", default=False) parser.addoption("--maintain", action="store_true", default=False) + parser.addoption("--test", action="store_true", default=None) @pytest.fixture(autouse=True) @@ -232,6 +233,8 @@ def pytest_collection_modifyitems(items: Any, config: Any) -> None: # Filter dependencies if they exist in regression data if its an improvement test if config.getoption("--improve"): dependencies = [dep for dep in dependencies if not data.get(dep, None)] + elif config.getoption("--test"): + dependencies = [] categories = test_class_instance.data.category diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index 917cd4e8a..ab2586e60 100644 --- a/agbenchmark/start_benchmark.py +++ 
b/agbenchmark/start_benchmark.py
@@ -94,7 +94,7 @@ def start(category: str, test: str, maintain: bool, improve: bool, mock: bool) -
     pytest_args = ["-vs"]
     if test:
         print("Running specific test:", test)
-        pytest_args.extend(["-k", test])
+        pytest_args.extend(["-k", test, "--test"])
     else:
         if category:
             pytest_args.extend(["-m", category])
-- 
cgit v1.2.3


From 077e143cc2074dea8bb930c4224b4afc13e5b501 Mon Sep 17 00:00:00 2001
From: James Collins
Date: Wed, 12 Jul 2023 18:38:48 -0700
Subject: Documentation/collate rearch notes (#4958)

* Add links to github issues in the README and clarify run instructions

* Added a new doc in the core package with architecture notes.
---
 autogpt/core/ARCHITECTURE_NOTES.md | 272 +++++++++++++++++++++++++++++++++++++
 autogpt/core/README.md | 6 +
 2 files changed, 278 insertions(+)
 create mode 100644 autogpt/core/ARCHITECTURE_NOTES.md

diff --git a/autogpt/core/ARCHITECTURE_NOTES.md b/autogpt/core/ARCHITECTURE_NOTES.md
new file mode 100644
index 000000000..b9fd2021b
--- /dev/null
+++ b/autogpt/core/ARCHITECTURE_NOTES.md
@@ -0,0 +1,272 @@
+# Re-architecture Notes
+
+## Key Documents
+
+- [Planned Agent Workflow](https://whimsical.com/agent-workflow-v2-NmnTQ8R7sVo7M3S43XgXmZ)
+- [Original Architecture Diagram](https://www.figma.com/file/fwdj44tPR7ArYtnGGUKknw/Modular-Architecture?type=whiteboard&node-id=0-1) - This is sadly well out of date at this point.
+- [Kanban](https://github.com/orgs/Significant-Gravitas/projects/1/views/1?filterQuery=label%3Are-arch)
+
+## The Motivation
+
+The `master` branch of Auto-GPT is an organically grown amalgamation of many thoughts
+and ideas about agent-driven autonomous systems. It lacks clear abstraction boundaries,
+has issues of global state and poorly encapsulated state, and is generally just hard to
+make effective changes to. Mainly it's just a system that's hard to make changes to.
+And research in the field is moving fast, so we want to be able to try new ideas
+quickly.
+
+## Initial Planning
+
+A large group of maintainers and contributors met to discuss the architectural
+challenges associated with the existing codebase. Many much-desired features (building
+new user interfaces, enabling project-specific agents, enabling multi-agent systems)
+are bottlenecked by the global state in the system. We discussed the tradeoffs between
+an incremental system transition and a big breaking version change and decided to go
+for the breaking version change. We justified this by saying:
+
+- We can maintain, in essence, the same user experience as now even with a radical
+  restructuring of the codebase
+- Our developer audience is struggling to use the existing codebase to build
+  applications and libraries of their own, so this breaking change will largely be
+  welcome.
+
+## Primary Goals
+
+- Separate the AutoGPT application code from the library code.
+- Remove global state from the system
+- Allow for multiple agents per user (with facilities for running simultaneously)
+- Create a serializable representation of an Agent
+- Encapsulate the core systems in abstractions with clear boundaries.
+
+## Secondary goals
+
+- Use existing tools to ditch any unnecessary cruft in the codebase (document loading,
+  json parsing, anything easier to replace than to port).
+- Bring in the [core agent loop updates](https://whimsical.com/agent-workflow-v2-NmnTQ8R7sVo7M3S43XgXmZ)
+  being developed simultaneously by @Pwuts
+
+# The Agent Subsystems
+
+## Configuration
+
+We want a lot of things from a configuration system.
We lean heavily on it in the
+`master` branch to allow several parts of the system to communicate with each other.
+[Recent work](https://github.com/Significant-Gravitas/Auto-GPT/pull/4737) has made it
+so that the config is no longer a singleton object that is materialized from the import
+state, but it's still treated as a
+[god object](https://en.wikipedia.org/wiki/God_object) containing all information about
+the system and _critically_ allowing any system to reference configuration information
+about other parts of the system.
+
+### What we want
+
+- It should still be reasonable to collate the entire system configuration in a
+  sensible way.
+- The configuration should be validatable and validated.
+- The system configuration should be a _serializable_ representation of an `Agent`.
+- The configuration system should provide a clear (albeit very low-level) contract
+  about user-configurable aspects of the system.
+- The configuration should reasonably manage default values and user-provided overrides.
+- The configuration system needs to handle credentials in a reasonable way.
+- The configuration should be the representation of some amount of system state, like
+  API budgets and resource usage. These aspects are recorded in the configuration and
+  updated by the system itself.
+- Agent systems should have encapsulated views of the configuration. E.g. the memory
+  system should know about memory configuration but nothing about command configuration.
+
+## Workspace
+
+There are two ways to think about the workspace:
+
+- The workspace is a scratch space for an agent where it can store files, write code,
+  and do pretty much whatever else it likes.
+- The workspace is, at any given point in time, the single source of truth for what an
+  agent is. It contains the serializable state (the configuration) as well as all
+  other working state (stored files, databases, memories, custom code).
+
+In the existing system there is **one** workspace. And because the workspace holds so
+much agent state, that means a user can only work with one agent at a time.
+
+## Memory
+
+The memory system has been under extremely active development.
+See [#3536](https://github.com/Significant-Gravitas/Auto-GPT/issues/3536) and
+[#4208](https://github.com/Significant-Gravitas/Auto-GPT/pull/4208) for discussion and
+work in the `master` branch. The TL;DR is
+that we noticed a couple of months ago that the `Agent` performed **worse** with
+permanent memory than without it. Since then the knowledge storage and retrieval
+system has been [redesigned](https://whimsical.com/memory-system-8Ae6x6QkjDwQAUe9eVJ6w1)
+and partially implemented in the `master` branch.
+
+## Planning/Prompt-Engineering
+
+The planning system is the system that translates user desires/agent intentions into
+language model prompts. In the course of development, it has become pretty clear
+that `Planning` is the wrong name for this system.
+
+### What we want
+
+- It should be incredibly obvious what's being passed to a language model, when it's
+  being passed, and what the language model response is. The landscape of language
+  model research is developing very rapidly, so building complex abstractions between
+  users/contributors and the language model interactions is going to make it very
+  difficult for us to nimbly respond to new research developments.
+- Prompt-engineering should ideally be exposed in a parameterizable way to users.
+- We should, where possible, leverage OpenAI's new
+  [function calling api](https://openai.com/blog/function-calling-and-other-api-updates)
+  to get outputs in a standard machine-readable format and avoid the deep pit of
+  parsing json (and fixing unparsable json).
+
+### Planning Strategies
+
+The [new agent workflow](https://whimsical.com/agent-workflow-v2-NmnTQ8R7sVo7M3S43XgXmZ)
+has many, many interaction points for language models. We really would like to not
+distribute prompt templates and raw strings all through the system. The re-arch solution
+is to encapsulate language model interactions into planning strategies.
+These strategies are defined by
+
+- The `LanguageModelClassification` they use (`FAST` or `SMART`)
+- A function `build_prompt` that takes strategy specific arguments and constructs a
+  `LanguageModelPrompt` (a simple container for lists of messages and functions to
+  pass to the language model)
+- A function `parse_content` that parses the response content (a dict) into a better
+  formatted dict. Contracts here are intentionally loose and will tighten once we have
+  at least one other language model provider.
+
+## Resources
+
+Resources are kinds of services we consume from external APIs. They may have associated
+credentials and costs we need to manage. Management of those credentials is implemented
+as manipulation of the resource configuration. We have two categories of resources
+currently:
+
+- AI/ML model providers (including language model providers and embedding model providers, i.e. OpenAI)
+- Memory providers (e.g. Pinecone, Weaviate, ChromaDB, etc.)
+
+### What we want
+
+- Resource abstractions should provide a common interface to different service providers
+  for a particular kind of service.
+- Resource abstractions should manipulate the configuration to manage their credentials
+  and budget/accounting.
+- Resource abstractions should be composable over an API (e.g. I should be able to make
+  an OpenAI provider that is both a LanguageModelProvider and an EmbeddingModelProvider
+  and use it wherever I need those services).
+
+## Abilities
+
+Along with planning and memory usage, abilities are one of the major augmentations of
+augmented language models. They allow us to expand the scope of what language models
+can do by hooking them up to code they can execute to obtain new knowledge or influence
+the world.
+
+### What we want
+
+- Abilities should have an extremely clear interface that users can write to.
+- Abilities should have an extremely clear interface that a language model can
+  understand.
+- Abilities should be declarative about their dependencies so the system can inject them
+- Abilities should be executable (where sensible) in an async run loop.
+- Abilities should not have side effects unless those side effects are clear in
+  their representation to an agent (e.g. the BrowseWeb ability shouldn't write a file,
+  but the WriteFile ability can).
+
+## Plugins
+
+Users want to add lots of features that we don't want to support as first-party.
+Our solution to this is a plugin system to allow users to plug in their functionality or
+to construct their agent from a public plugin marketplace. Our primary concern in the
+re-arch is to build a stateless plugin service interface and a simple implementation
+that can load plugins from installed packages or from zip files. Future efforts will
+expand this system to allow plugins to load from a marketplace or some other kind
+of service.
+
+### What is a Plugin
+
+Plugins are a kind of garbage term.
They refer to a number of things.
+
+- New commands for the agent to execute. This is the most common usage.
+- Replacements for entire subsystems like memory or language model providers
+- Application plugins that do things like send emails or communicate via WhatsApp
+- The repositories contributors create that may themselves have multiple plugins in them.
+
+### Usage in the existing system
+
+The current plugin system is _hook-based_. This means plugins don't correspond to
+kinds of objects in the system, but rather to times in the system at which we defer
+execution to them. The main advantage of this setup is that user code can hijack
+pretty much any behavior of the agent by injecting code that supersedes the normal
+agent execution. The disadvantages to this approach are numerous:
+
+- We have absolutely no mechanisms to enforce any security measures because the threat
+  surface is everything.
+- We cannot reason about agent behavior in a cohesive way because control flow can be
+  ceded to user code at pretty much any point and arbitrarily change or break the
+  agent behavior
+- The interface for designing a plugin is kind of terrible and difficult to standardize
+- The hook-based implementation means we couple ourselves to a particular flow of
+  control (or otherwise risk breaking plugin behavior). E.g. many of the hook targets
+  in the [old workflow](https://whimsical.com/agent-workflow-VAzeKcup3SR7awpNZJKTyK)
+  are not present or mean something entirely different in the
+  [new workflow](https://whimsical.com/agent-workflow-v2-NmnTQ8R7sVo7M3S43XgXmZ).
+- Etc.
+
+### What we want
+
+- A concrete definition of a plugin that is narrow enough in scope that we can define
+  it well and reason about how it will work in the system.
+- A set of abstractions that let us define a plugin by its storage format and location
+- A service interface that knows how to parse the plugin abstractions and turn them
+  into concrete classes and objects.
+
+
+## Some Notes on how and why we'll use OO in this project
+
+First and foremost, Python itself is an object-oriented language. Its
+underlying [data model](https://docs.python.org/3/reference/datamodel.html) is built
+with object-oriented programming in mind. It offers useful tools like abstract base
+classes to communicate interfaces to developers who want to, e.g., write plugins, or
+help work on implementations. If we were working in a different language that offered
+different tools, we'd use a different paradigm.
+
+While many things are classes in the re-arch, they are not classes in the same way.
+There are three kinds of things (roughly) that are written as classes in the re-arch:
+1. **Configuration**: Auto-GPT has *a lot* of configuration. This configuration
+   is *data* and we use **[Pydantic](https://docs.pydantic.dev/latest/)** to manage it as
+   pydantic is basically industry standard for this stuff. It provides runtime validation
+   for all the configuration and allows us to easily serialize configuration to both basic
+   python types (dicts, lists, and primitives) as well as serialize to json, which is
+   important for us being able to put representations of agents
+   [on the wire](https://en.wikipedia.org/wiki/Wire_protocol) for web applications and
+   agent-to-agent communication. *These are essentially
+   [structs](https://en.wikipedia.org/wiki/Struct_(C_programming_language)) rather than
+   traditional classes.*
+2. **Internal Data**: Very similar to configuration, Auto-GPT passes around boatloads
+   of internal data.
We are interacting with language models and language model APIs + which means we are handling lots of *structured* but *raw* text. Here we also + leverage **pydantic** to both *parse* and *validate* the internal data and also to + give us concrete types which we can use static type checkers to validate against + and discover problems before they show up as bugs at runtime. *These are + essentially [structs](https://en.wikipedia.org/wiki/Struct_(C_programming_language)) + rather than traditional classes.* +3. **System Interfaces**: This is our primary traditional use of classes in the + re-arch. We have a bunch of systems. We want many of those systems to have + alternative implementations (e.g. via plugins). We use abstract base classes to + define interfaces to communicate with people who might want to provide those + plugins. We provide a single concrete implementation of most of those systems as a + subclass of the interface. This should not be controversial. + +The approach is consistent with +[prior](https://github.com/Significant-Gravitas/Auto-GPT/issues/2458) +[work](https://github.com/Significant-Gravitas/Auto-GPT/pull/2442) done by other +maintainers in this direction. + +From an organization standpoint, OO programming is by far the most popular programming +paradigm (especially for Python). It's the one most often taught in programming classes +and the one with the most available online training for people interested in +contributing. + +Finally, and importantly, we scoped the plan and initial design of the re-arch as a +large group of maintainers and collaborators early on. This is consistent with the +design we chose and no-one offered alternatives. + \ No newline at end of file diff --git a/autogpt/core/README.md b/autogpt/core/README.md index c27fa28fe..e5bbc108e 100644 --- a/autogpt/core/README.md +++ b/autogpt/core/README.md @@ -6,6 +6,12 @@ a work in progress and is not yet feature complete. In particular, it does not have many of the Auto-GPT commands implemented and is pending ongoing work to [re-incorporate vector-based memory and knowledge retrieval](https://github.com/Significant-Gravitas/Auto-GPT/issues/3536). +## [Overview](ARCHITECTURE_NOTES.md) + +The Auto-GPT Re-arch is a re-implementation of the Auto-GPT agent that is designed to be more modular, +more extensible, and more maintainable than the original Auto-GPT agent. It is also designed to be +more accessible to new developers and to be easier to contribute to. The re-arch is a work in progress +and is not yet feature complete. It is also not yet ready for production use. 
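To make the planning-strategy contract from the architecture notes above concrete, here is a minimal sketch. Only `LanguageModelClassification` (`FAST`/`SMART`), `LanguageModelPrompt`, `build_prompt`, and `parse_content` are taken from the notes; the `PromptStrategy` name, the field layout, and every other detail are illustrative assumptions rather than the actual implementation.

```
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from enum import Enum
from typing import Any


class LanguageModelClassification(Enum):
    FAST = "fast"    # cheaper, lower-latency model
    SMART = "smart"  # more capable, more expensive model


@dataclass
class LanguageModelPrompt:
    # Simple container for the messages and functions passed to the model.
    messages: list[dict[str, Any]] = field(default_factory=list)
    functions: list[dict[str, Any]] = field(default_factory=list)


class PromptStrategy(ABC):
    """One encapsulated language model interaction (name is an assumption)."""

    # Which model tier this strategy targets.
    model_classification: LanguageModelClassification

    @abstractmethod
    def build_prompt(self, **kwargs: Any) -> LanguageModelPrompt:
        """Turn strategy-specific arguments into a concrete prompt."""

    @abstractmethod
    def parse_content(self, response_content: dict) -> dict:
        """Parse raw response content into a better-formatted dict."""
```

A concrete strategy would subclass this, declare its classification, and keep its template strings local to itself, which is exactly the encapsulation the notes argue for.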
## Running the Re-arch Code -- cgit v1.2.3 From 4177c37b51d2f2c8f19c929cadd2609262682605 Mon Sep 17 00:00:00 2001 From: James Collins Date: Wed, 12 Jul 2023 19:36:00 -0700 Subject: Refactor/move functions in app to agent (#4957) * Add links to github issues in the README and clarify run instructions * Move things only used by the agent out of app.py and into the agent module * Fix busted dynamic import --- autogpt/agent/agent.py | 95 ++++++++++++++++++++++++++++++- autogpt/app.py | 114 ------------------------------------- autogpt/main.py | 1 - tests/unit/test_agent.py | 57 +++++++------------ tests/unit/test_execute_command.py | 23 -------- 5 files changed, 111 insertions(+), 179 deletions(-) delete mode 100644 autogpt/app.py delete mode 100644 tests/unit/test_execute_command.py diff --git a/autogpt/agent/agent.py b/autogpt/agent/agent.py index 88b3fa809..316cc4d44 100644 --- a/autogpt/agent/agent.py +++ b/autogpt/agent/agent.py @@ -9,6 +9,7 @@ from colorama import Fore, Style from autogpt.config import Config from autogpt.config.ai_config import AIConfig from autogpt.json_utils.utilities import extract_json_from_response, validate_json +from autogpt.llm import ChatModelResponse from autogpt.llm.chat import chat_with_ai from autogpt.llm.providers.openai import OPEN_AI_CHAT_MODELS from autogpt.llm.utils import count_string_tokens @@ -86,9 +87,6 @@ class Agent: self.smart_token_limit = OPEN_AI_CHAT_MODELS.get(config.smart_llm).max_tokens def start_interaction_loop(self): - # Avoid circular imports - from autogpt.app import execute_command, extract_command - # Interaction Loop self.cycle_count = 0 command_name = None @@ -307,3 +305,94 @@ class Agent: logger.typewriter_log( "SYSTEM: ", Fore.YELLOW, "Unable to execute command" ) + + +def extract_command( + assistant_reply_json: dict, assistant_reply: ChatModelResponse, config: Config +): + """Parse the response and return the command name and arguments + + Args: + assistant_reply_json (dict): The response object from the AI + assistant_reply (ChatModelResponse): The model response from the AI + config (Config): The config object + + Returns: + tuple: The command name and arguments + + Raises: + json.decoder.JSONDecodeError: If the response is not valid JSON + + Exception: If any other error occurs + """ + if config.openai_functions: + if assistant_reply.function_call is None: + return "Error:", "No 'function_call' in assistant reply" + assistant_reply_json["command"] = { + "name": assistant_reply.function_call.name, + "args": json.loads(assistant_reply.function_call.arguments), + } + try: + if "command" not in assistant_reply_json: + return "Error:", "Missing 'command' object in JSON" + + if not isinstance(assistant_reply_json, dict): + return ( + "Error:", + f"The previous message sent was not a dictionary {assistant_reply_json}", + ) + + command = assistant_reply_json["command"] + if not isinstance(command, dict): + return "Error:", "'command' object is not a dictionary" + + if "name" not in command: + return "Error:", "Missing 'name' field in 'command' object" + + command_name = command["name"] + + # Use an empty dictionary if 'args' field is not present in 'command' object + arguments = command.get("args", {}) + + return command_name, arguments + except json.decoder.JSONDecodeError: + return "Error:", "Invalid JSON" + # All other errors, return "Error: + error message" + except Exception as e: + return "Error:", str(e) + + +def execute_command( + command_name: str, + arguments: dict[str, str], + agent: Agent, +): + """Execute the command 
and return the result + + Args: + command_name (str): The name of the command to execute + arguments (dict): The arguments for the command + agent (Agent): The agent that is executing the command + + Returns: + str: The result of the command + """ + try: + # Execute a native command with the same name or alias, if it exists + if command := agent.command_registry.get_command(command_name): + return command(**arguments, agent=agent) + + # Handle non-native commands (e.g. from plugins) + for command in agent.ai_config.prompt_generator.commands: + if ( + command_name == command["label"].lower() + or command_name == command["name"].lower() + ): + return command["function"](**arguments) + + raise RuntimeError( + f"Cannot execute '{command_name}': unknown command." + " Do not try to use this command again." + ) + except Exception as e: + return f"Error: {str(e)}" diff --git a/autogpt/app.py b/autogpt/app.py deleted file mode 100644 index ea5072f81..000000000 --- a/autogpt/app.py +++ /dev/null @@ -1,114 +0,0 @@ -""" Command and Control """ -import json -from typing import Dict - -from autogpt.agent.agent import Agent -from autogpt.config import Config -from autogpt.llm import ChatModelResponse - - -def is_valid_int(value: str) -> bool: - """Check if the value is a valid integer - - Args: - value (str): The value to check - - Returns: - bool: True if the value is a valid integer, False otherwise - """ - try: - int(value) - return True - except ValueError: - return False - - -def extract_command( - assistant_reply_json: Dict, assistant_reply: ChatModelResponse, config: Config -): - """Parse the response and return the command name and arguments - - Args: - assistant_reply_json (dict): The response object from the AI - assistant_reply (ChatModelResponse): The model response from the AI - config (Config): The config object - - Returns: - tuple: The command name and arguments - - Raises: - json.decoder.JSONDecodeError: If the response is not valid JSON - - Exception: If any other error occurs - """ - if config.openai_functions: - if assistant_reply.function_call is None: - return "Error:", "No 'function_call' in assistant reply" - assistant_reply_json["command"] = { - "name": assistant_reply.function_call.name, - "args": json.loads(assistant_reply.function_call.arguments), - } - try: - if "command" not in assistant_reply_json: - return "Error:", "Missing 'command' object in JSON" - - if not isinstance(assistant_reply_json, dict): - return ( - "Error:", - f"The previous message sent was not a dictionary {assistant_reply_json}", - ) - - command = assistant_reply_json["command"] - if not isinstance(command, dict): - return "Error:", "'command' object is not a dictionary" - - if "name" not in command: - return "Error:", "Missing 'name' field in 'command' object" - - command_name = command["name"] - - # Use an empty dictionary if 'args' field is not present in 'command' object - arguments = command.get("args", {}) - - return command_name, arguments - except json.decoder.JSONDecodeError: - return "Error:", "Invalid JSON" - # All other errors, return "Error: + error message" - except Exception as e: - return "Error:", str(e) - - -def execute_command( - command_name: str, - arguments: dict[str, str], - agent: Agent, -): - """Execute the command and return the result - - Args: - command_name (str): The name of the command to execute - arguments (dict): The arguments for the command - agent (Agent): The agent that is executing the command - - Returns: - str: The result of the command - """ - try: - # Execute a 
native command with the same name or alias, if it exists - if command := agent.command_registry.get_command(command_name): - return command(**arguments, agent=agent) - - # Handle non-native commands (e.g. from plugins) - for command in agent.ai_config.prompt_generator.commands: - if ( - command_name == command["label"].lower() - or command_name == command["name"].lower() - ): - return command["function"](**arguments) - - raise RuntimeError( - f"Cannot execute '{command_name}': unknown command." - " Do not try to use this command again." - ) - except Exception as e: - return f"Error: {str(e)}" diff --git a/autogpt/main.py b/autogpt/main.py index 08ac4b400..4ef3fc949 100644 --- a/autogpt/main.py +++ b/autogpt/main.py @@ -28,7 +28,6 @@ COMMAND_CATEGORIES = [ "autogpt.commands.file_operations", "autogpt.commands.web_search", "autogpt.commands.web_selenium", - "autogpt.app", "autogpt.commands.task_statuses", ] diff --git a/tests/unit/test_agent.py b/tests/unit/test_agent.py index 3fb896bad..351454be0 100644 --- a/tests/unit/test_agent.py +++ b/tests/unit/test_agent.py @@ -1,46 +1,27 @@ -from unittest.mock import MagicMock - -import pytest - -from autogpt.agent import Agent -from autogpt.config import AIConfig -from autogpt.config.config import Config - - -@pytest.fixture -def agent(config: Config): - ai_name = "Test AI" - memory = MagicMock() - next_action_count = 0 - command_registry = MagicMock() - ai_config = AIConfig(ai_name=ai_name) - system_prompt = "System prompt" - triggering_prompt = "Triggering prompt" - workspace_directory = "workspace_directory" - - agent = Agent( - ai_name=ai_name, - memory=memory, - next_action_count=next_action_count, - command_registry=command_registry, - ai_config=ai_config, - config=config, - system_prompt=system_prompt, - triggering_prompt=triggering_prompt, - workspace_directory=workspace_directory, - ) - return agent +from autogpt.agent.agent import Agent, execute_command def test_agent_initialization(agent: Agent): - assert agent.ai_name == "Test AI" - assert agent.memory == agent.memory + assert agent.ai_name == "Base" assert agent.history.messages == [] assert agent.next_action_count == 0 - assert agent.command_registry == agent.command_registry - assert agent.ai_config == agent.ai_config - assert agent.system_prompt == "System prompt" - assert agent.triggering_prompt == "Triggering prompt" + + +def test_execute_command_plugin(agent: Agent): + """Test that executing a command that came from a plugin works as expected""" + command_name = "check_plan" + agent.ai_config.prompt_generator.add_command( + command_name, + "Read the plan.md with the next goals to achieve", + {}, + lambda: "hi", + ) + command_result = execute_command( + command_name=command_name, + arguments={}, + agent=agent, + ) + assert command_result == "hi" # More test methods can be added for specific agent interactions diff --git a/tests/unit/test_execute_command.py b/tests/unit/test_execute_command.py deleted file mode 100644 index 21fb0b66e..000000000 --- a/tests/unit/test_execute_command.py +++ /dev/null @@ -1,23 +0,0 @@ -from autogpt.agent import Agent -from autogpt.app import execute_command - - -def check_plan(): - return "hi" - - -def test_execute_command_plugin(agent: Agent): - """Test that executing a command that came from a plugin works as expected""" - command_name = "check_plan" - agent.ai_config.prompt_generator.add_command( - command_name, - "Read the plan.md with the next goals to achieve", - {}, - check_plan, - ) - command_result = execute_command( - 
command_name=command_name, - arguments={}, - agent=agent, - ) - assert command_result == "hi" -- cgit v1.2.3 From c9adedf746758817913f334098110350fb21f8ce Mon Sep 17 00:00:00 2001 From: James Collins Date: Thu, 13 Jul 2023 07:31:49 -0700 Subject: Refactor/rename agent subpackage to agents (#4961) * Add links to github issues in the README and clarify run instructions * Rename agent subpackage to agents * Revert all unwanted changes * Use relative import in `agents/__init__.py` --------- Co-authored-by: Reinier van der Leer --- autogpt/agent/__init__.py | 3 - autogpt/agent/agent.py | 398 --------------------- autogpt/agents/__init__.py | 3 + autogpt/agents/agent.py | 398 +++++++++++++++++++++ autogpt/commands/decorators.py | 2 +- autogpt/commands/execute_code.py | 2 +- autogpt/commands/file_operations.py | 2 +- autogpt/commands/git_operations.py | 2 +- autogpt/commands/image_gen.py | 2 +- autogpt/commands/task_statuses.py | 2 +- autogpt/commands/web_search.py | 2 +- autogpt/commands/web_selenium.py | 2 +- autogpt/llm/chat.py | 2 +- autogpt/main.py | 2 +- autogpt/memory/message_history.py | 2 +- benchmarks.py | 2 +- .../debug_code/test_debug_code_challenge_a.py | 2 +- tests/conftest.py | 2 +- tests/integration/agent_factory.py | 2 +- tests/integration/test_execute_code.py | 2 +- tests/integration/test_image_gen.py | 2 +- tests/integration/test_web_selenium.py | 2 +- tests/unit/test_agent.py | 2 +- tests/unit/test_file_operations.py | 2 +- tests/unit/test_git_commands.py | 2 +- tests/unit/test_message_history.py | 2 +- tests/unit/test_web_search.py | 2 +- 27 files changed, 424 insertions(+), 424 deletions(-) delete mode 100644 autogpt/agent/__init__.py delete mode 100644 autogpt/agent/agent.py create mode 100644 autogpt/agents/__init__.py create mode 100644 autogpt/agents/agent.py diff --git a/autogpt/agent/__init__.py b/autogpt/agent/__init__.py deleted file mode 100644 index 90d1148c2..000000000 --- a/autogpt/agent/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from autogpt.agent.agent import Agent - -__all__ = ["Agent"] diff --git a/autogpt/agent/agent.py b/autogpt/agent/agent.py deleted file mode 100644 index 316cc4d44..000000000 --- a/autogpt/agent/agent.py +++ /dev/null @@ -1,398 +0,0 @@ -import json -import signal -import sys -from datetime import datetime -from pathlib import Path - -from colorama import Fore, Style - -from autogpt.config import Config -from autogpt.config.ai_config import AIConfig -from autogpt.json_utils.utilities import extract_json_from_response, validate_json -from autogpt.llm import ChatModelResponse -from autogpt.llm.chat import chat_with_ai -from autogpt.llm.providers.openai import OPEN_AI_CHAT_MODELS -from autogpt.llm.utils import count_string_tokens -from autogpt.logs import ( - FULL_MESSAGE_HISTORY_FILE_NAME, - NEXT_ACTION_FILE_NAME, - USER_INPUT_FILE_NAME, - LogCycleHandler, - logger, - print_assistant_thoughts, - remove_ansi_escape, -) -from autogpt.memory.message_history import MessageHistory -from autogpt.memory.vector import VectorMemory -from autogpt.models.command_registry import CommandRegistry -from autogpt.speech import say_text -from autogpt.spinner import Spinner -from autogpt.utils import clean_input -from autogpt.workspace import Workspace - - -class Agent: - """Agent class for interacting with Auto-GPT. - - Attributes: - ai_name: The name of the agent. - memory: The memory object to use. - next_action_count: The number of actions to execute. 
- system_prompt: The system prompt is the initial prompt that defines everything - the AI needs to know to achieve its task successfully. - Currently, the dynamic and customizable information in the system prompt are - ai_name, description and goals. - - triggering_prompt: The last sentence the AI will see before answering. - For Auto-GPT, this prompt is: - Determine exactly one command to use, and respond using the format specified - above: - The triggering prompt is not part of the system prompt because between the - system prompt and the triggering - prompt we have contextual information that can distract the AI and make it - forget that its goal is to find the next task to achieve. - SYSTEM PROMPT - CONTEXTUAL INFORMATION (memory, previous conversations, anything relevant) - TRIGGERING PROMPT - - The triggering prompt reminds the AI about its short term meta task - (defining the next task) - """ - - def __init__( - self, - ai_name: str, - memory: VectorMemory, - next_action_count: int, - command_registry: CommandRegistry, - ai_config: AIConfig, - system_prompt: str, - triggering_prompt: str, - workspace_directory: str | Path, - config: Config, - ): - self.ai_name = ai_name - self.memory = memory - self.history = MessageHistory.for_model(config.smart_llm, agent=self) - self.next_action_count = next_action_count - self.command_registry = command_registry - self.config = config - self.ai_config = ai_config - self.system_prompt = system_prompt - self.triggering_prompt = triggering_prompt - self.workspace = Workspace(workspace_directory, config.restrict_to_workspace) - self.created_at = datetime.now().strftime("%Y%m%d_%H%M%S") - self.cycle_count = 0 - self.log_cycle_handler = LogCycleHandler() - self.smart_token_limit = OPEN_AI_CHAT_MODELS.get(config.smart_llm).max_tokens - - def start_interaction_loop(self): - # Interaction Loop - self.cycle_count = 0 - command_name = None - arguments = None - user_input = "" - - # Signal handler for interrupting y -N - def signal_handler(signum, frame): - if self.next_action_count == 0: - sys.exit() - else: - print( - Fore.RED - + "Interrupt signal received. Stopping continuous command execution." - + Style.RESET_ALL - ) - self.next_action_count = 0 - - signal.signal(signal.SIGINT, signal_handler) - - while True: - # Discontinue if continuous limit is reached - self.cycle_count += 1 - self.log_cycle_handler.log_count_within_cycle = 0 - self.log_cycle_handler.log_cycle( - self.ai_config.ai_name, - self.created_at, - self.cycle_count, - [m.raw() for m in self.history], - FULL_MESSAGE_HISTORY_FILE_NAME, - ) - if ( - self.config.continuous_mode - and self.config.continuous_limit > 0 - and self.cycle_count > self.config.continuous_limit - ): - logger.typewriter_log( - "Continuous Limit Reached: ", - Fore.YELLOW, - f"{self.config.continuous_limit}", - ) - break - # Send message to AI, get response - with Spinner("Thinking... 
", plain_output=self.config.plain_output): - assistant_reply = chat_with_ai( - self.config, - self, - self.system_prompt, - self.triggering_prompt, - self.smart_token_limit, - self.config.smart_llm, - ) - - try: - assistant_reply_json = extract_json_from_response( - assistant_reply.content - ) - validate_json(assistant_reply_json, self.config) - except json.JSONDecodeError as e: - logger.error(f"Exception while validating assistant reply JSON: {e}") - assistant_reply_json = {} - - for plugin in self.config.plugins: - if not plugin.can_handle_post_planning(): - continue - assistant_reply_json = plugin.post_planning(assistant_reply_json) - - # Print Assistant thoughts - if assistant_reply_json != {}: - # Get command name and arguments - try: - print_assistant_thoughts( - self.ai_name, assistant_reply_json, self.config - ) - command_name, arguments = extract_command( - assistant_reply_json, assistant_reply, self.config - ) - if self.config.speak_mode: - say_text(f"I want to execute {command_name}", self.config) - - except Exception as e: - logger.error("Error: \n", str(e)) - self.log_cycle_handler.log_cycle( - self.ai_config.ai_name, - self.created_at, - self.cycle_count, - assistant_reply_json, - NEXT_ACTION_FILE_NAME, - ) - - # First log new-line so user can differentiate sections better in console - logger.typewriter_log("\n") - logger.typewriter_log( - "NEXT ACTION: ", - Fore.CYAN, - f"COMMAND = {Fore.CYAN}{remove_ansi_escape(command_name)}{Style.RESET_ALL} " - f"ARGUMENTS = {Fore.CYAN}{arguments}{Style.RESET_ALL}", - ) - - if not self.config.continuous_mode and self.next_action_count == 0: - # ### GET USER AUTHORIZATION TO EXECUTE COMMAND ### - # Get key press: Prompt the user to press enter to continue or escape - # to exit - self.user_input = "" - logger.info( - f"Enter '{self.config.authorise_key}' to authorise command, " - f"'{self.config.authorise_key} -N' to run N continuous commands, " - f"'{self.config.exit_key}' to exit program, or enter feedback for " - f"{self.ai_name}..." - ) - while True: - if self.config.chat_messages_enabled: - console_input = clean_input( - self.config, "Waiting for your response..." - ) - else: - console_input = clean_input( - self.config, Fore.MAGENTA + "Input:" + Style.RESET_ALL - ) - if console_input.lower().strip() == self.config.authorise_key: - user_input = "GENERATE NEXT COMMAND JSON" - break - elif console_input.lower().strip() == "": - logger.warn("Invalid input format.") - continue - elif console_input.lower().startswith( - f"{self.config.authorise_key} -" - ): - try: - self.next_action_count = abs( - int(console_input.split(" ")[1]) - ) - user_input = "GENERATE NEXT COMMAND JSON" - except ValueError: - logger.warn( - f"Invalid input format. Please enter '{self.config.authorise_key} -n' " - "where n is the number of continuous tasks." 
- ) - continue - break - elif console_input.lower() == self.config.exit_key: - user_input = "EXIT" - break - else: - user_input = console_input - command_name = "human_feedback" - self.log_cycle_handler.log_cycle( - self.ai_config.ai_name, - self.created_at, - self.cycle_count, - user_input, - USER_INPUT_FILE_NAME, - ) - break - - if user_input == "GENERATE NEXT COMMAND JSON": - logger.typewriter_log( - "-=-=-=-=-=-=-= COMMAND AUTHORISED BY USER -=-=-=-=-=-=-=", - Fore.MAGENTA, - "", - ) - elif user_input == "EXIT": - logger.info("Exiting...") - break - else: - # First log new-line so user can differentiate sections better in console - logger.typewriter_log("\n") - # Print authorized commands left value - logger.typewriter_log( - f"{Fore.CYAN}AUTHORISED COMMANDS LEFT: {Style.RESET_ALL}{self.next_action_count}" - ) - - # Execute command - if command_name is not None and command_name.lower().startswith("error"): - result = f"Could not execute command: {arguments}" - elif command_name == "human_feedback": - result = f"Human feedback: {user_input}" - else: - for plugin in self.config.plugins: - if not plugin.can_handle_pre_command(): - continue - command_name, arguments = plugin.pre_command( - command_name, arguments - ) - command_result = execute_command( - command_name=command_name, - arguments=arguments, - agent=self, - ) - result = f"Command {command_name} returned: " f"{command_result}" - - result_tlength = count_string_tokens( - str(command_result), self.config.smart_llm - ) - memory_tlength = count_string_tokens( - str(self.history.summary_message()), self.config.smart_llm - ) - if result_tlength + memory_tlength + 600 > self.smart_token_limit: - result = f"Failure: command {command_name} returned too much output. \ - Do not execute this command again with the same arguments." 
- - for plugin in self.config.plugins: - if not plugin.can_handle_post_command(): - continue - result = plugin.post_command(command_name, result) - if self.next_action_count > 0: - self.next_action_count -= 1 - - # Check if there's a result from the command append it to the message - # history - if result is not None: - self.history.add("system", result, "action_result") - logger.typewriter_log("SYSTEM: ", Fore.YELLOW, result) - else: - self.history.add("system", "Unable to execute command", "action_result") - logger.typewriter_log( - "SYSTEM: ", Fore.YELLOW, "Unable to execute command" - ) - - -def extract_command( - assistant_reply_json: dict, assistant_reply: ChatModelResponse, config: Config -): - """Parse the response and return the command name and arguments - - Args: - assistant_reply_json (dict): The response object from the AI - assistant_reply (ChatModelResponse): The model response from the AI - config (Config): The config object - - Returns: - tuple: The command name and arguments - - Raises: - json.decoder.JSONDecodeError: If the response is not valid JSON - - Exception: If any other error occurs - """ - if config.openai_functions: - if assistant_reply.function_call is None: - return "Error:", "No 'function_call' in assistant reply" - assistant_reply_json["command"] = { - "name": assistant_reply.function_call.name, - "args": json.loads(assistant_reply.function_call.arguments), - } - try: - if "command" not in assistant_reply_json: - return "Error:", "Missing 'command' object in JSON" - - if not isinstance(assistant_reply_json, dict): - return ( - "Error:", - f"The previous message sent was not a dictionary {assistant_reply_json}", - ) - - command = assistant_reply_json["command"] - if not isinstance(command, dict): - return "Error:", "'command' object is not a dictionary" - - if "name" not in command: - return "Error:", "Missing 'name' field in 'command' object" - - command_name = command["name"] - - # Use an empty dictionary if 'args' field is not present in 'command' object - arguments = command.get("args", {}) - - return command_name, arguments - except json.decoder.JSONDecodeError: - return "Error:", "Invalid JSON" - # All other errors, return "Error: + error message" - except Exception as e: - return "Error:", str(e) - - -def execute_command( - command_name: str, - arguments: dict[str, str], - agent: Agent, -): - """Execute the command and return the result - - Args: - command_name (str): The name of the command to execute - arguments (dict): The arguments for the command - agent (Agent): The agent that is executing the command - - Returns: - str: The result of the command - """ - try: - # Execute a native command with the same name or alias, if it exists - if command := agent.command_registry.get_command(command_name): - return command(**arguments, agent=agent) - - # Handle non-native commands (e.g. from plugins) - for command in agent.ai_config.prompt_generator.commands: - if ( - command_name == command["label"].lower() - or command_name == command["name"].lower() - ): - return command["function"](**arguments) - - raise RuntimeError( - f"Cannot execute '{command_name}': unknown command." - " Do not try to use this command again." 
- ) - except Exception as e: - return f"Error: {str(e)}" diff --git a/autogpt/agents/__init__.py b/autogpt/agents/__init__.py new file mode 100644 index 000000000..a6df24ad7 --- /dev/null +++ b/autogpt/agents/__init__.py @@ -0,0 +1,3 @@ +from .agent import Agent + +__all__ = ["Agent"] diff --git a/autogpt/agents/agent.py b/autogpt/agents/agent.py new file mode 100644 index 000000000..316cc4d44 --- /dev/null +++ b/autogpt/agents/agent.py @@ -0,0 +1,398 @@ +import json +import signal +import sys +from datetime import datetime +from pathlib import Path + +from colorama import Fore, Style + +from autogpt.config import Config +from autogpt.config.ai_config import AIConfig +from autogpt.json_utils.utilities import extract_json_from_response, validate_json +from autogpt.llm import ChatModelResponse +from autogpt.llm.chat import chat_with_ai +from autogpt.llm.providers.openai import OPEN_AI_CHAT_MODELS +from autogpt.llm.utils import count_string_tokens +from autogpt.logs import ( + FULL_MESSAGE_HISTORY_FILE_NAME, + NEXT_ACTION_FILE_NAME, + USER_INPUT_FILE_NAME, + LogCycleHandler, + logger, + print_assistant_thoughts, + remove_ansi_escape, +) +from autogpt.memory.message_history import MessageHistory +from autogpt.memory.vector import VectorMemory +from autogpt.models.command_registry import CommandRegistry +from autogpt.speech import say_text +from autogpt.spinner import Spinner +from autogpt.utils import clean_input +from autogpt.workspace import Workspace + + +class Agent: + """Agent class for interacting with Auto-GPT. + + Attributes: + ai_name: The name of the agent. + memory: The memory object to use. + next_action_count: The number of actions to execute. + system_prompt: The system prompt is the initial prompt that defines everything + the AI needs to know to achieve its task successfully. + Currently, the dynamic and customizable information in the system prompt are + ai_name, description and goals. + + triggering_prompt: The last sentence the AI will see before answering. + For Auto-GPT, this prompt is: + Determine exactly one command to use, and respond using the format specified + above: + The triggering prompt is not part of the system prompt because between the + system prompt and the triggering + prompt we have contextual information that can distract the AI and make it + forget that its goal is to find the next task to achieve. 
+ SYSTEM PROMPT + CONTEXTUAL INFORMATION (memory, previous conversations, anything relevant) + TRIGGERING PROMPT + + The triggering prompt reminds the AI about its short term meta task + (defining the next task) + """ + + def __init__( + self, + ai_name: str, + memory: VectorMemory, + next_action_count: int, + command_registry: CommandRegistry, + ai_config: AIConfig, + system_prompt: str, + triggering_prompt: str, + workspace_directory: str | Path, + config: Config, + ): + self.ai_name = ai_name + self.memory = memory + self.history = MessageHistory.for_model(config.smart_llm, agent=self) + self.next_action_count = next_action_count + self.command_registry = command_registry + self.config = config + self.ai_config = ai_config + self.system_prompt = system_prompt + self.triggering_prompt = triggering_prompt + self.workspace = Workspace(workspace_directory, config.restrict_to_workspace) + self.created_at = datetime.now().strftime("%Y%m%d_%H%M%S") + self.cycle_count = 0 + self.log_cycle_handler = LogCycleHandler() + self.smart_token_limit = OPEN_AI_CHAT_MODELS.get(config.smart_llm).max_tokens + + def start_interaction_loop(self): + # Interaction Loop + self.cycle_count = 0 + command_name = None + arguments = None + user_input = "" + + # Signal handler for interrupting y -N + def signal_handler(signum, frame): + if self.next_action_count == 0: + sys.exit() + else: + print( + Fore.RED + + "Interrupt signal received. Stopping continuous command execution." + + Style.RESET_ALL + ) + self.next_action_count = 0 + + signal.signal(signal.SIGINT, signal_handler) + + while True: + # Discontinue if continuous limit is reached + self.cycle_count += 1 + self.log_cycle_handler.log_count_within_cycle = 0 + self.log_cycle_handler.log_cycle( + self.ai_config.ai_name, + self.created_at, + self.cycle_count, + [m.raw() for m in self.history], + FULL_MESSAGE_HISTORY_FILE_NAME, + ) + if ( + self.config.continuous_mode + and self.config.continuous_limit > 0 + and self.cycle_count > self.config.continuous_limit + ): + logger.typewriter_log( + "Continuous Limit Reached: ", + Fore.YELLOW, + f"{self.config.continuous_limit}", + ) + break + # Send message to AI, get response + with Spinner("Thinking... 
", plain_output=self.config.plain_output): + assistant_reply = chat_with_ai( + self.config, + self, + self.system_prompt, + self.triggering_prompt, + self.smart_token_limit, + self.config.smart_llm, + ) + + try: + assistant_reply_json = extract_json_from_response( + assistant_reply.content + ) + validate_json(assistant_reply_json, self.config) + except json.JSONDecodeError as e: + logger.error(f"Exception while validating assistant reply JSON: {e}") + assistant_reply_json = {} + + for plugin in self.config.plugins: + if not plugin.can_handle_post_planning(): + continue + assistant_reply_json = plugin.post_planning(assistant_reply_json) + + # Print Assistant thoughts + if assistant_reply_json != {}: + # Get command name and arguments + try: + print_assistant_thoughts( + self.ai_name, assistant_reply_json, self.config + ) + command_name, arguments = extract_command( + assistant_reply_json, assistant_reply, self.config + ) + if self.config.speak_mode: + say_text(f"I want to execute {command_name}", self.config) + + except Exception as e: + logger.error("Error: \n", str(e)) + self.log_cycle_handler.log_cycle( + self.ai_config.ai_name, + self.created_at, + self.cycle_count, + assistant_reply_json, + NEXT_ACTION_FILE_NAME, + ) + + # First log new-line so user can differentiate sections better in console + logger.typewriter_log("\n") + logger.typewriter_log( + "NEXT ACTION: ", + Fore.CYAN, + f"COMMAND = {Fore.CYAN}{remove_ansi_escape(command_name)}{Style.RESET_ALL} " + f"ARGUMENTS = {Fore.CYAN}{arguments}{Style.RESET_ALL}", + ) + + if not self.config.continuous_mode and self.next_action_count == 0: + # ### GET USER AUTHORIZATION TO EXECUTE COMMAND ### + # Get key press: Prompt the user to press enter to continue or escape + # to exit + self.user_input = "" + logger.info( + f"Enter '{self.config.authorise_key}' to authorise command, " + f"'{self.config.authorise_key} -N' to run N continuous commands, " + f"'{self.config.exit_key}' to exit program, or enter feedback for " + f"{self.ai_name}..." + ) + while True: + if self.config.chat_messages_enabled: + console_input = clean_input( + self.config, "Waiting for your response..." + ) + else: + console_input = clean_input( + self.config, Fore.MAGENTA + "Input:" + Style.RESET_ALL + ) + if console_input.lower().strip() == self.config.authorise_key: + user_input = "GENERATE NEXT COMMAND JSON" + break + elif console_input.lower().strip() == "": + logger.warn("Invalid input format.") + continue + elif console_input.lower().startswith( + f"{self.config.authorise_key} -" + ): + try: + self.next_action_count = abs( + int(console_input.split(" ")[1]) + ) + user_input = "GENERATE NEXT COMMAND JSON" + except ValueError: + logger.warn( + f"Invalid input format. Please enter '{self.config.authorise_key} -n' " + "where n is the number of continuous tasks." 
+ ) + continue + break + elif console_input.lower() == self.config.exit_key: + user_input = "EXIT" + break + else: + user_input = console_input + command_name = "human_feedback" + self.log_cycle_handler.log_cycle( + self.ai_config.ai_name, + self.created_at, + self.cycle_count, + user_input, + USER_INPUT_FILE_NAME, + ) + break + + if user_input == "GENERATE NEXT COMMAND JSON": + logger.typewriter_log( + "-=-=-=-=-=-=-= COMMAND AUTHORISED BY USER -=-=-=-=-=-=-=", + Fore.MAGENTA, + "", + ) + elif user_input == "EXIT": + logger.info("Exiting...") + break + else: + # First log new-line so user can differentiate sections better in console + logger.typewriter_log("\n") + # Print authorized commands left value + logger.typewriter_log( + f"{Fore.CYAN}AUTHORISED COMMANDS LEFT: {Style.RESET_ALL}{self.next_action_count}" + ) + + # Execute command + if command_name is not None and command_name.lower().startswith("error"): + result = f"Could not execute command: {arguments}" + elif command_name == "human_feedback": + result = f"Human feedback: {user_input}" + else: + for plugin in self.config.plugins: + if not plugin.can_handle_pre_command(): + continue + command_name, arguments = plugin.pre_command( + command_name, arguments + ) + command_result = execute_command( + command_name=command_name, + arguments=arguments, + agent=self, + ) + result = f"Command {command_name} returned: " f"{command_result}" + + result_tlength = count_string_tokens( + str(command_result), self.config.smart_llm + ) + memory_tlength = count_string_tokens( + str(self.history.summary_message()), self.config.smart_llm + ) + if result_tlength + memory_tlength + 600 > self.smart_token_limit: + result = f"Failure: command {command_name} returned too much output. \ + Do not execute this command again with the same arguments." 
+ + for plugin in self.config.plugins: + if not plugin.can_handle_post_command(): + continue + result = plugin.post_command(command_name, result) + if self.next_action_count > 0: + self.next_action_count -= 1 + + # Check if there's a result from the command append it to the message + # history + if result is not None: + self.history.add("system", result, "action_result") + logger.typewriter_log("SYSTEM: ", Fore.YELLOW, result) + else: + self.history.add("system", "Unable to execute command", "action_result") + logger.typewriter_log( + "SYSTEM: ", Fore.YELLOW, "Unable to execute command" + ) + + +def extract_command( + assistant_reply_json: dict, assistant_reply: ChatModelResponse, config: Config +): + """Parse the response and return the command name and arguments + + Args: + assistant_reply_json (dict): The response object from the AI + assistant_reply (ChatModelResponse): The model response from the AI + config (Config): The config object + + Returns: + tuple: The command name and arguments + + Raises: + json.decoder.JSONDecodeError: If the response is not valid JSON + + Exception: If any other error occurs + """ + if config.openai_functions: + if assistant_reply.function_call is None: + return "Error:", "No 'function_call' in assistant reply" + assistant_reply_json["command"] = { + "name": assistant_reply.function_call.name, + "args": json.loads(assistant_reply.function_call.arguments), + } + try: + if "command" not in assistant_reply_json: + return "Error:", "Missing 'command' object in JSON" + + if not isinstance(assistant_reply_json, dict): + return ( + "Error:", + f"The previous message sent was not a dictionary {assistant_reply_json}", + ) + + command = assistant_reply_json["command"] + if not isinstance(command, dict): + return "Error:", "'command' object is not a dictionary" + + if "name" not in command: + return "Error:", "Missing 'name' field in 'command' object" + + command_name = command["name"] + + # Use an empty dictionary if 'args' field is not present in 'command' object + arguments = command.get("args", {}) + + return command_name, arguments + except json.decoder.JSONDecodeError: + return "Error:", "Invalid JSON" + # All other errors, return "Error: + error message" + except Exception as e: + return "Error:", str(e) + + +def execute_command( + command_name: str, + arguments: dict[str, str], + agent: Agent, +): + """Execute the command and return the result + + Args: + command_name (str): The name of the command to execute + arguments (dict): The arguments for the command + agent (Agent): The agent that is executing the command + + Returns: + str: The result of the command + """ + try: + # Execute a native command with the same name or alias, if it exists + if command := agent.command_registry.get_command(command_name): + return command(**arguments, agent=agent) + + # Handle non-native commands (e.g. from plugins) + for command in agent.ai_config.prompt_generator.commands: + if ( + command_name == command["label"].lower() + or command_name == command["name"].lower() + ): + return command["function"](**arguments) + + raise RuntimeError( + f"Cannot execute '{command_name}': unknown command." + " Do not try to use this command again." 
+ ) + except Exception as e: + return f"Error: {str(e)}" diff --git a/autogpt/commands/decorators.py b/autogpt/commands/decorators.py index 3528af04b..b63c76d53 100644 --- a/autogpt/commands/decorators.py +++ b/autogpt/commands/decorators.py @@ -2,7 +2,7 @@ import functools from pathlib import Path from typing import Callable -from autogpt.agent.agent import Agent +from autogpt.agents.agent import Agent from autogpt.logs import logger diff --git a/autogpt/commands/execute_code.py b/autogpt/commands/execute_code.py index aad93193e..2403b2ba5 100644 --- a/autogpt/commands/execute_code.py +++ b/autogpt/commands/execute_code.py @@ -7,7 +7,7 @@ import docker from docker.errors import DockerException, ImageNotFound from docker.models.containers import Container as DockerContainer -from autogpt.agent.agent import Agent +from autogpt.agents.agent import Agent from autogpt.command_decorator import command from autogpt.config import Config from autogpt.logs import logger diff --git a/autogpt/commands/file_operations.py b/autogpt/commands/file_operations.py index 0a06da318..939b7dc18 100644 --- a/autogpt/commands/file_operations.py +++ b/autogpt/commands/file_operations.py @@ -8,7 +8,7 @@ import os.path from pathlib import Path from typing import Generator, Literal -from autogpt.agent.agent import Agent +from autogpt.agents.agent import Agent from autogpt.command_decorator import command from autogpt.logs import logger from autogpt.memory.vector import MemoryItem, VectorMemory diff --git a/autogpt/commands/git_operations.py b/autogpt/commands/git_operations.py index 276031f78..021157fbb 100644 --- a/autogpt/commands/git_operations.py +++ b/autogpt/commands/git_operations.py @@ -2,7 +2,7 @@ from git.repo import Repo -from autogpt.agent.agent import Agent +from autogpt.agents.agent import Agent from autogpt.command_decorator import command from autogpt.url_utils.validators import validate_url diff --git a/autogpt/commands/image_gen.py b/autogpt/commands/image_gen.py index b1a89b289..abae6149e 100644 --- a/autogpt/commands/image_gen.py +++ b/autogpt/commands/image_gen.py @@ -9,7 +9,7 @@ import openai import requests from PIL import Image -from autogpt.agent.agent import Agent +from autogpt.agents.agent import Agent from autogpt.command_decorator import command from autogpt.logs import logger diff --git a/autogpt/commands/task_statuses.py b/autogpt/commands/task_statuses.py index 062ebe3a4..34908928f 100644 --- a/autogpt/commands/task_statuses.py +++ b/autogpt/commands/task_statuses.py @@ -3,7 +3,7 @@ from __future__ import annotations from typing import NoReturn -from autogpt.agent.agent import Agent +from autogpt.agents.agent import Agent from autogpt.command_decorator import command from autogpt.logs import logger diff --git a/autogpt/commands/web_search.py b/autogpt/commands/web_search.py index d47d680b2..9ea0d2061 100644 --- a/autogpt/commands/web_search.py +++ b/autogpt/commands/web_search.py @@ -7,7 +7,7 @@ from itertools import islice from duckduckgo_search import DDGS -from autogpt.agent.agent import Agent +from autogpt.agents.agent import Agent from autogpt.command_decorator import command DUCKDUCKGO_MAX_ATTEMPTS = 3 diff --git a/autogpt/commands/web_selenium.py b/autogpt/commands/web_selenium.py index 821957f3e..948d799e9 100644 --- a/autogpt/commands/web_selenium.py +++ b/autogpt/commands/web_selenium.py @@ -27,7 +27,7 @@ from webdriver_manager.chrome import ChromeDriverManager from webdriver_manager.firefox import GeckoDriverManager from webdriver_manager.microsoft import 
EdgeChromiumDriverManager as EdgeDriverManager -from autogpt.agent.agent import Agent +from autogpt.agents.agent import Agent from autogpt.command_decorator import command from autogpt.logs import logger from autogpt.memory.vector import MemoryItem, get_memory diff --git a/autogpt/llm/chat.py b/autogpt/llm/chat.py index 4364cb1d8..f08fdab4e 100644 --- a/autogpt/llm/chat.py +++ b/autogpt/llm/chat.py @@ -4,7 +4,7 @@ import time from typing import TYPE_CHECKING if TYPE_CHECKING: - from autogpt.agent.agent import Agent + from autogpt.agents.agent import Agent from autogpt.config import Config from autogpt.llm.api_manager import ApiManager diff --git a/autogpt/main.py b/autogpt/main.py index 4ef3fc949..0da2d193b 100644 --- a/autogpt/main.py +++ b/autogpt/main.py @@ -6,7 +6,7 @@ from typing import Optional from colorama import Fore, Style -from autogpt.agent import Agent +from autogpt.agents import Agent from autogpt.config.config import ConfigBuilder, check_openai_api_key from autogpt.configurator import create_config from autogpt.logs import logger diff --git a/autogpt/memory/message_history.py b/autogpt/memory/message_history.py index 30dbbb809..c718f2edb 100644 --- a/autogpt/memory/message_history.py +++ b/autogpt/memory/message_history.py @@ -6,7 +6,7 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Optional if TYPE_CHECKING: - from autogpt.agent import Agent + from autogpt.agents import Agent from autogpt.config import Config from autogpt.json_utils.utilities import extract_json_from_response diff --git a/benchmarks.py b/benchmarks.py index cb592be8a..2e143f9d6 100644 --- a/benchmarks.py +++ b/benchmarks.py @@ -1,4 +1,4 @@ -from autogpt.agent import Agent +from autogpt.agents import Agent from autogpt.config import AIConfig, Config, ConfigBuilder from autogpt.main import COMMAND_CATEGORIES from autogpt.memory.vector import get_memory diff --git a/tests/challenges/debug_code/test_debug_code_challenge_a.py b/tests/challenges/debug_code/test_debug_code_challenge_a.py index 90a7084dc..c846f9ce5 100644 --- a/tests/challenges/debug_code/test_debug_code_challenge_a.py +++ b/tests/challenges/debug_code/test_debug_code_challenge_a.py @@ -3,7 +3,7 @@ from pathlib import Path import pytest from pytest_mock import MockerFixture -from autogpt.agent import Agent +from autogpt.agents import Agent from autogpt.commands.execute_code import execute_python_file from autogpt.workspace import Workspace from tests.challenges.challenge_decorator.challenge_decorator import challenge diff --git a/tests/conftest.py b/tests/conftest.py index 64e840247..09d358e69 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,7 +6,7 @@ import pytest import yaml from pytest_mock import MockerFixture -from autogpt.agent.agent import Agent +from autogpt.agents.agent import Agent from autogpt.config import AIConfig, Config, ConfigBuilder from autogpt.config.ai_config import AIConfig from autogpt.llm.api_manager import ApiManager diff --git a/tests/integration/agent_factory.py b/tests/integration/agent_factory.py index 664c6cbb4..d3832c27a 100644 --- a/tests/integration/agent_factory.py +++ b/tests/integration/agent_factory.py @@ -1,6 +1,6 @@ import pytest -from autogpt.agent import Agent +from autogpt.agents import Agent from autogpt.config import AIConfig, Config from autogpt.memory.vector import get_memory from autogpt.models.command_registry import CommandRegistry diff --git a/tests/integration/test_execute_code.py b/tests/integration/test_execute_code.py index b7be8622a..80010c6f2 100644 --- 
a/tests/integration/test_execute_code.py +++ b/tests/integration/test_execute_code.py @@ -7,7 +7,7 @@ import tempfile import pytest import autogpt.commands.execute_code as sut # system under testing -from autogpt.agent.agent import Agent +from autogpt.agents.agent import Agent from autogpt.config import Config diff --git a/tests/integration/test_image_gen.py b/tests/integration/test_image_gen.py index 8cdcfd986..0a9f68978 100644 --- a/tests/integration/test_image_gen.py +++ b/tests/integration/test_image_gen.py @@ -6,7 +6,7 @@ from unittest.mock import patch import pytest from PIL import Image -from autogpt.agent.agent import Agent +from autogpt.agents.agent import Agent from autogpt.commands.image_gen import generate_image, generate_image_with_sd_webui diff --git a/tests/integration/test_web_selenium.py b/tests/integration/test_web_selenium.py index e900b4b3f..43de2860e 100644 --- a/tests/integration/test_web_selenium.py +++ b/tests/integration/test_web_selenium.py @@ -1,7 +1,7 @@ import pytest from pytest_mock import MockerFixture -from autogpt.agent.agent import Agent +from autogpt.agents.agent import Agent from autogpt.commands.web_selenium import browse_website diff --git a/tests/unit/test_agent.py b/tests/unit/test_agent.py index 351454be0..7baeeb64f 100644 --- a/tests/unit/test_agent.py +++ b/tests/unit/test_agent.py @@ -1,4 +1,4 @@ -from autogpt.agent.agent import Agent, execute_command +from autogpt.agents.agent import Agent, execute_command def test_agent_initialization(agent: Agent): diff --git a/tests/unit/test_file_operations.py b/tests/unit/test_file_operations.py index f9c571d8c..d7d870a59 100644 --- a/tests/unit/test_file_operations.py +++ b/tests/unit/test_file_operations.py @@ -12,7 +12,7 @@ import pytest from pytest_mock import MockerFixture import autogpt.commands.file_operations as file_ops -from autogpt.agent.agent import Agent +from autogpt.agents.agent import Agent from autogpt.config import Config from autogpt.memory.vector.memory_item import MemoryItem from autogpt.memory.vector.utils import Embedding diff --git a/tests/unit/test_git_commands.py b/tests/unit/test_git_commands.py index a6defdfc3..9f56a3840 100644 --- a/tests/unit/test_git_commands.py +++ b/tests/unit/test_git_commands.py @@ -2,7 +2,7 @@ import pytest from git.exc import GitCommandError from git.repo.base import Repo -from autogpt.agent.agent import Agent +from autogpt.agents.agent import Agent from autogpt.commands.git_operations import clone_repository diff --git a/tests/unit/test_message_history.py b/tests/unit/test_message_history.py index 9b275252c..ec01cd558 100644 --- a/tests/unit/test_message_history.py +++ b/tests/unit/test_message_history.py @@ -4,7 +4,7 @@ from unittest.mock import MagicMock import pytest -from autogpt.agent import Agent +from autogpt.agents import Agent from autogpt.config import AIConfig from autogpt.config.config import Config from autogpt.llm.base import ChatModelResponse, ChatSequence, Message diff --git a/tests/unit/test_web_search.py b/tests/unit/test_web_search.py index 4f5143069..790b1c2f6 100644 --- a/tests/unit/test_web_search.py +++ b/tests/unit/test_web_search.py @@ -3,7 +3,7 @@ import json import pytest from googleapiclient.errors import HttpError -from autogpt.agent.agent import Agent +from autogpt.agents.agent import Agent from autogpt.commands.web_search import google, safe_google_results, web_search -- cgit v1.2.3 From a0f5aa942de3a94ce6a173c784d7a73b17a134a6 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Thu, 13 Jul 2023 18:35:50 +0200 
Subject: Fix Netlify preview builds --- netlify.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/netlify.toml b/netlify.toml index 43e79f0fd..de261908f 100644 --- a/netlify.toml +++ b/netlify.toml @@ -3,4 +3,4 @@ [build] publish = "public/" command = "mkdocs build -d public" - ignore = "git diff --quiet HEAD^ HEAD docs mkdocs.yml CONTRIBUTING.md CODE_OF_CONDUCT.md LICENSE" + ignore = "git diff --quiet $CACHED_COMMIT_REF $COMMIT_REF docs mkdocs.yml CONTRIBUTING.md CODE_OF_CONDUCT.md LICENSE" -- cgit v1.2.3 From ed12b2c7d6036e16ac49123d62963f0819861fdd Mon Sep 17 00:00:00 2001 From: Antonov Maxim <99024963+antonovmaxim@users.noreply.github.com> Date: Thu, 13 Jul 2023 20:21:25 +0300 Subject: Allow absolute paths when not restricting to workspace root (#4946) * restrict_to_root fix * Fix formatting --------- Co-authored-by: Reinier van der Leer --- autogpt/workspace/workspace.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/autogpt/workspace/workspace.py b/autogpt/workspace/workspace.py index 6d90f8540..07186e735 100644 --- a/autogpt/workspace/workspace.py +++ b/autogpt/workspace/workspace.py @@ -123,7 +123,11 @@ class Workspace: logger.debug(f"Resolved root as '{root}'") # Allow exception for absolute paths if they are contained in your workspace directory. - if relative_path.is_absolute() and not relative_path.is_relative_to(root): + if ( + relative_path.is_absolute() + and restrict_to_root + and not relative_path.is_relative_to(root) + ): raise ValueError( f"Attempted to access absolute path '{relative_path}' in workspace '{root}'." ) -- cgit v1.2.3 From 7c4fc45b4a536e6423b6491c34953a2516863d78 Mon Sep 17 00:00:00 2001 From: Vasek Mlejnsky Date: Thu, 13 Jul 2023 18:14:57 -0600 Subject: Add initial share logs page (#4965) * Add initial share logs page * Fix title sizes * Update share_logs.md Added some text for reasons to share logs * Add section on how to share logs using e2b * Fix path to images with sizes * Fix paths to images in docs * Fix formatting * Fix formatting * Fix grammar * Make position in menu more prominent * original log directory was incorrect I took the directory from usage.md but that is incorrect * Updated the directory for the logs Updated the directory for the logs * added some text and made it pretty --------- Co-authored-by: NeonN3mesis <129052650+NeonN3mesis@users.noreply.github.com> Co-authored-by: Reinier van der Leer --- docs/imgs/e2b-dashboard.png | Bin 0 -> 515634 bytes docs/imgs/e2b-log-url.png | Bin 0 -> 43687 bytes docs/imgs/e2b-new-tag.png | Bin 0 -> 47736 bytes docs/imgs/e2b-tag-button.png | Bin 0 -> 20635 bytes docs/share-your-logs.md | 52 +++++++++++++++++++++++++++++++++++++++++++ docs/usage.md | 12 ++++++++-- mkdocs.yml | 3 ++- 7 files changed, 64 insertions(+), 3 deletions(-) create mode 100644 docs/imgs/e2b-dashboard.png create mode 100644 docs/imgs/e2b-log-url.png create mode 100644 docs/imgs/e2b-new-tag.png create mode 100644 docs/imgs/e2b-tag-button.png create mode 100644 docs/share-your-logs.md diff --git a/docs/imgs/e2b-dashboard.png b/docs/imgs/e2b-dashboard.png new file mode 100644 index 000000000..456f1490c Binary files /dev/null and b/docs/imgs/e2b-dashboard.png differ diff --git a/docs/imgs/e2b-log-url.png b/docs/imgs/e2b-log-url.png new file mode 100644 index 000000000..3f1c189ee Binary files /dev/null and b/docs/imgs/e2b-log-url.png differ diff --git a/docs/imgs/e2b-new-tag.png b/docs/imgs/e2b-new-tag.png new file mode 100644 index 000000000..65a0a767c Binary files /dev/null and 
b/docs/imgs/e2b-new-tag.png differ diff --git a/docs/imgs/e2b-tag-button.png b/docs/imgs/e2b-tag-button.png new file mode 100644 index 000000000..741a6bac1 Binary files /dev/null and b/docs/imgs/e2b-tag-button.png differ diff --git a/docs/share-your-logs.md b/docs/share-your-logs.md new file mode 100644 index 000000000..f673e375c --- /dev/null +++ b/docs/share-your-logs.md @@ -0,0 +1,52 @@ +## Share your logs with us to help improve Auto-GPT + +Do you notice weird behavior with your agent? Do you have an interesting use case? Do you have a bug you want to report? +Follow the steps below to enable your logs and upload them. You can include these logs when making an issue report or discussing an issue with us. + +### Enable Debug Logs +Activity, Error, and Debug logs are located in `./logs`. + +To print out debug logs: + +``` shell +./run.sh --debug # on Linux / macOS + +.\run.bat --debug # on Windows + +docker-compose run --rm auto-gpt --debug # in Docker +``` + +### Inspect and share logs +You can inspect and share logs via [e2b](https://e2b.dev). +![E2b logs dashboard](./imgs/e2b-dashboard.png) + + + +1. Go to [autogpt.e2b.dev](https://autogpt.e2b.dev) and sign in. +2. You'll see logs from other members of the AutoGPT team that you can inspect. +3. Or you can upload your own logs. Click on the "Upload log folder" button and select the debug logs directory that you generated. Wait 1-2 seconds and the page will reload. +4. You can share logs by sharing the URL in your browser. +![E2b log URL](./imgs/e2b-log-url.png) + + +### Add tags to logs +You can add custom tags to logs for other members of your team. This is useful if you want to indicate, for example, that the agent is having issues with challenges. + +E2b offers three severity levels: + +- Success +- Warning +- Error + +You can name your tag any way you want. + +#### How to add a tag +1. Click on the "plus" button to the left of the logs folder name. + +![E2b tag button](./imgs/e2b-tag-button.png) + +2. Type the name of a new tag. + +3. Select the severity. + +![E2b new tag](./imgs/e2b-new-tag.png) diff --git a/docs/usage.md b/docs/usage.md index a9ef2883e..cb74ef7f6 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -89,12 +89,20 @@ This may give your bot increased intelligence. ## Logs -Activity and error logs are located in the `./output/logs` +Activity, Error, and Debug logs are located in `./logs`. + +!!! tip + Do you notice weird behavior with your agent? Do you have an interesting use case? Do you have a bug you want to report? + Follow the steps below to enable your logs. You can include these logs when making an issue report or discussing an issue with us.
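If you want to package the log folder before sharing or attaching it, a small helper script can zip it up first. The sketch below is illustrative only and assumes the default `./logs` directory described above; `LOG_DIR` and `ARCHIVE_NAME` are assumed names, not part of the Auto-GPT codebase.

```python
# Minimal sketch: archive the debug logs so they can be attached to an issue
# or uploaded in one piece. Assumes the default ./logs directory; LOG_DIR and
# ARCHIVE_NAME are illustrative, not names used by Auto-GPT itself.
import shutil
from pathlib import Path

LOG_DIR = Path("logs")         # default log location when running with --debug
ARCHIVE_NAME = "autogpt-logs"  # hypothetical base name for the zip file

if LOG_DIR.is_dir():
    archive = shutil.make_archive(ARCHIVE_NAME, "zip", root_dir=LOG_DIR)
    print(f"Wrote {archive}; share this file or upload the raw folder to e2b")
else:
    print(f"No {LOG_DIR}/ directory found; run Auto-GPT with --debug first")
```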
To print out debug logs: ``` shell -./run.sh --debug +./run.sh --debug # on Linux / macOS + +.\run.bat --debug # on Windows + +docker-compose run --rm auto-gpt --debug # in Docker ``` ## Disabling Command Categories diff --git a/mkdocs.yml b/mkdocs.yml index 50e062571..a85004453 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -12,7 +12,8 @@ nav: - Voice: configuration/voice.md - Image Generation: configuration/imagegen.md - - Contributing: + - Help us improve Auto-GPT: + - Share your debug logs with us: share-your-logs.md - Contribution guide: contributing.md - Running tests: testing.md - Code of Conduct: code-of-conduct.md -- cgit v1.2.3 From 3a9dfa4c594dd7628de0a1b4bb9e3a15f1c1f172 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Thu, 13 Jul 2023 20:47:55 -0700 Subject: Update submodules and upload artifacts (#97) Signed-off-by: Merwane Hamadi --- .github/workflows/ci.yml | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 34eedb292..0e9263861 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -135,15 +135,15 @@ jobs: fi pip install ../../dist/*.whl - - if [ "${GITHUB_EVENT_NAME}" == "schedule" ] || [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ]; then - curl -s https://raw.githubusercontent.com/Helicone/helicone/main/mitmproxy.sh | bash -s start - agbenchmark start --maintain - else + + if [ "${GITHUB_EVENT_NAME}" == "pull_request" ]; then agbenchmark start --maintain --mock agbenchmark start --improve --mock agbenchmark start --mock agbenchmark start --mock --category=retrieval + else + curl -s https://raw.githubusercontent.com/Helicone/helicone/main/mitmproxy.sh | bash -s start + agbenchmark start --maintain fi env: GITHUB_EVENT_NAME: ${{ github.event_name }} @@ -155,9 +155,10 @@ jobs: HELICONE_CACHE_ENABLED: true HELICONE_PROPERTY_AGENT: ${{ matrix.agent-name }} - - name: Upload logs as artifact + + - name: Upload reports if: always() uses: actions/upload-artifact@v3 with: - name: gpt-engineer-projects - path: agent/gpt-engineer/projects + name: ${{ matrix.agent-name }} + path: agent/${{ matrix.agent-name }}/agbenchmark -- cgit v1.2.3 From 376ecf0c5f1180739b94b68b4ee5cdcd7dca2f09 Mon Sep 17 00:00:00 2001 From: GECORegulatory <121075828+GECORegulatory@users.noreply.github.com> Date: Fri, 14 Jul 2023 12:24:49 -0400 Subject: Replaced Fictitious color name Fore.ORANGE (#4972) Changed Colorama.Fore.ORANGE to YELLOW in config.py, As Colorama does not support an ORANGE color. This fixes a fatal error in run.sh when trying to set the API key through the input() method. 
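As a quick illustration of the underlying bug (an editor's sketch, not part of the patch): `colorama.Fore` defines YELLOW but no ORANGE, so the original attribute access raised `AttributeError` the moment that code path ran.

```python
# Why the original code failed: colorama's Fore palette has no ORANGE member,
# so Fore.ORANGE raises AttributeError as soon as the line executes.
from colorama import Fore

print(hasattr(Fore, "YELLOW"))  # True  -- a supported ANSI foreground color
print(hasattr(Fore, "ORANGE"))  # False -- not defined by colorama

try:
    _ = Fore.ORANGE  # the access that crashed run.sh during API-key entry
except AttributeError as err:
    print(f"AttributeError: {err}")
```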
Co-authored-by: James Collins --- autogpt/config/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autogpt/config/config.py b/autogpt/config/config.py index ae2f7bedc..cb3f26d3e 100644 --- a/autogpt/config/config.py +++ b/autogpt/config/config.py @@ -367,7 +367,7 @@ def check_openai_api_key(config: Config) -> None: print( Fore.GREEN + "OpenAI API key successfully set!\n" - + Fore.ORANGE + + Fore.YELLOW + "NOTE: The API key you've set is only temporary.\n" + "For longer sessions, please set it in .env file" + Fore.RESET -- cgit v1.2.3 From a9702e4629d4b1d90d118b4dabbbb665f5635e97 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Fri, 14 Jul 2023 10:27:48 -0700 Subject: Add basic code generation challenge (#98) --- .github/workflows/ci.yml | 4 +++ agbenchmark/challenge.py | 7 +++++ agbenchmark/challenges/README.md | 16 +++++++++++ .../challenges/code/d4/artifacts_out/__init__.py | 0 .../challenges/code/d4/artifacts_out/code.py | 12 +++++++++ agbenchmark/challenges/code/d4/data.json | 18 +++++++++++++ .../challenges/code/d4/hidden_files/test.py | 31 ++++++++++++++++++++++ agent/gpt-engineer | 2 +- pyproject.toml | 4 +-- 9 files changed, 91 insertions(+), 3 deletions(-) create mode 100644 agbenchmark/challenges/code/d4/artifacts_out/__init__.py create mode 100644 agbenchmark/challenges/code/d4/artifacts_out/code.py create mode 100644 agbenchmark/challenges/code/d4/data.json create mode 100644 agbenchmark/challenges/code/d4/hidden_files/test.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0e9263861..b7864db6a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -141,6 +141,10 @@ jobs: agbenchmark start --improve --mock agbenchmark start --mock agbenchmark start --mock --category=retrieval + agbenchmark start --mock --category=interface + agbenchmark start --mock --category=code + agbenchmark start --mock --category=memory + agbenchmark start --mock --category=iterate else curl -s https://raw.githubusercontent.com/Helicone/helicone/main/mitmproxy.sh | bash -s start agbenchmark start --maintain diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py index aeebd7ad8..874fd45bd 100644 --- a/agbenchmark/challenge.py +++ b/agbenchmark/challenge.py @@ -46,6 +46,13 @@ class Challenge(ABC): run_agent(self.task, config, self.CHALLENGE_LOCATION) + # hidden files are added after the agent runs. Hidden files can be python test files. + # We copy them in the workspace to make it easy to import the code produced by the agent + + copy_artifacts_into_workspace( + config["workspace"], "hidden_files", self.CHALLENGE_LOCATION + ) + def test_method(self, config: Dict[str, Any]) -> None: raise NotImplementedError diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md index 305cd28f1..a890c9d36 100644 --- a/agbenchmark/challenges/README.md +++ b/agbenchmark/challenges/README.md @@ -46,3 +46,19 @@ Example: Current Output: - **score** (float): scores range from [0, 1] + +## Add files to challenges: + +### artifacts_in + +This folder contains all the files you want the agent to have in its workspace BEFORE the challenge starts + +### artifacts_out +This folder contains all the files you would like the agent to generate. This folder is used to mock the agent. +This allows to run agbenchmark start --test=TestExample --mock and make sure our challenge actually works. + +### hidden_files +This folder contains files hidden from the agent but useful to assess whether a challenge is successful. 
+For example we can have a test.py in it, and this test.py will be added to the workspace at the end of a challenge. +This allows us to run this test.py and easily import code generated by the agent. +For example see: TestBasicCodeGeneration challenge. diff --git a/agbenchmark/challenges/code/d4/artifacts_out/__init__.py b/agbenchmark/challenges/code/d4/artifacts_out/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/challenges/code/d4/artifacts_out/code.py b/agbenchmark/challenges/code/d4/artifacts_out/code.py new file mode 100644 index 000000000..de3d8c62c --- /dev/null +++ b/agbenchmark/challenges/code/d4/artifacts_out/code.py @@ -0,0 +1,12 @@ +# mypy: ignore-errors +from typing import List, Optional + + +def two_sum(nums: List, target: int) -> Optional[List[int]]: + seen = {} + for i, num in enumerate(nums): + complement = target - num + if complement in seen: + return [seen[complement], i] + seen[num] = i + return None diff --git a/agbenchmark/challenges/code/d4/data.json b/agbenchmark/challenges/code/d4/data.json new file mode 100644 index 000000000..51f6f2702 --- /dev/null +++ b/agbenchmark/challenges/code/d4/data.json @@ -0,0 +1,18 @@ +{ + "name": "TestBasicCodeGeneration", + "category": ["code", "iterate"], + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "dependencies": ["TestWriteFile"], + "ground": { + "answer": "The two_sum function coded properly.", + "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], + "should_not_contain": [], + "files": ["test.py"], + "type": "execute_python_code" + }, + "info": { + "difficulty": "novice", + "description": "Tests ability for the agent to create the two_sum function.", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/code/d4/hidden_files/test.py b/agbenchmark/challenges/code/d4/hidden_files/test.py new file mode 100644 index 000000000..d85d13537 --- /dev/null +++ b/agbenchmark/challenges/code/d4/hidden_files/test.py @@ -0,0 +1,31 @@ +# mypy: ignore-errors +from code import two_sum +from typing import List + + +def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: + result = two_sum(nums, target) + print(result) + assert ( + result == expected_result + ), f"AssertionError: Expected the output to be {expected_result}" + + +if __name__ == "__main__": + # test the trivial case with the first two numbers + nums = [2, 7, 11, 15] + target = 9 + expected_result = [0, 1] + test_two_sum(nums, target, expected_result) + + # test for ability to use zero and the same number twice + nums = [2, 7, 0, 15, 12, 0] + target = 0 + expected_result = [2, 5] + test_two_sum(nums, target, expected_result) + + # test for first and last index usage and negative numbers + nums = [-6, 7, 11, 4] + target = -2 + expected_result = [0, 3] + test_two_sum(nums, target, expected_result) diff --git a/agent/gpt-engineer b/agent/gpt-engineer index 521d626c0..bca191cd7 160000 --- a/agent/gpt-engineer +++ b/agent/gpt-engineer @@ -1 +1 @@ -Subproject commit 521d626c0075ed6545f01b771757c856f8addbd6 +Subproject commit bca191cd76cdea0335da91d004c64d9bb8520fea diff --git a/pyproject.toml b/pyproject.toml index b0526ab57..48be9cf5d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ 
-38,10 +38,10 @@ testpaths = [ ] markers = [ "retrieval", - "regression", "interface", "code", - "memory" + "memory", + "iterate" ] [tool.poetry.scripts] -- cgit v1.2.3 From 7bc7d9213df32cabf8e96f422741c037b7817487 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Fri, 14 Jul 2023 14:39:47 -0700 Subject: Replace hidden files with custom python (#99) Signed-off-by: Merwane Hamadi --- agbenchmark/challenge.py | 2 +- agbenchmark/challenges/README.md | 9 +++---- .../challenges/code/d3/custom_python/api_tests.py | 14 ++++++++++ agbenchmark/challenges/code/d3/data.json | 6 ++--- .../challenges/code/d4/custom_python/test.py | 31 ++++++++++++++++++++++ .../challenges/code/d4/hidden_files/test.py | 31 ---------------------- agbenchmark/challenges/test_all.py | 20 +------------- 7 files changed, 54 insertions(+), 59 deletions(-) create mode 100644 agbenchmark/challenges/code/d4/custom_python/test.py delete mode 100644 agbenchmark/challenges/code/d4/hidden_files/test.py diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py index 874fd45bd..f07faf8ee 100644 --- a/agbenchmark/challenge.py +++ b/agbenchmark/challenge.py @@ -50,7 +50,7 @@ class Challenge(ABC): # We copy them in the workspace to make it easy to import the code produced by the agent copy_artifacts_into_workspace( - config["workspace"], "hidden_files", self.CHALLENGE_LOCATION + config["workspace"], "custom_python", self.CHALLENGE_LOCATION ) def test_method(self, config: Dict[str, Any]) -> None: diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md index a890c9d36..34e35154e 100644 --- a/agbenchmark/challenges/README.md +++ b/agbenchmark/challenges/README.md @@ -57,8 +57,7 @@ This folder contains all the files you want the agent to have in its workspace B This folder contains all the files you would like the agent to generate. This folder is used to mock the agent. This allows to run agbenchmark start --test=TestExample --mock and make sure our challenge actually works. -### hidden_files -This folder contains files hidden from the agent but useful to assess whether a challenge is successful. -For example we can have a test.py in it, and this test.py will be added to the workspace at the end of a challenge. -This allows us to run this test.py and easily import code generated by the agent. -For example see: TestBasicCodeGeneration challenge. +### custom_python +This folder contains files that will be copied into the agent's workspace and run after the challenge is completed. +For example we can have a test.py in it and run this file in the workspace to easily import code generated by the agent. +Example: TestBasicCodeGeneration challenge. 
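To make the custom_python pattern concrete, here is a minimal sketch of such a check file. The `from code import two_sum` import mirrors the TestBasicCodeGeneration challenge described above; the `check` helper and its expected values are illustrative assumptions, not code from the repository.

```python
# Minimal sketch of a custom_python file. Because it is copied into the
# agent's workspace after the run, it can import modules the agent generated
# there. The module name `code` mirrors the TestBasicCodeGeneration challenge;
# the check() helper and its expected values are illustrative.
from code import two_sum  # agent-generated file in the same workspace


def check() -> None:
    result = two_sum([2, 7, 11, 15], 9)
    print(result)  # the harness inspects stdout for the expected answer
    assert result == [0, 1], f"AssertionError: Expected [0, 1], got {result}"


if __name__ == "__main__":
    check()
```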
diff --git a/agbenchmark/challenges/code/d3/custom_python/api_tests.py b/agbenchmark/challenges/code/d3/custom_python/api_tests.py index 1d6255ebd..f01934ef8 100644 --- a/agbenchmark/challenges/code/d3/custom_python/api_tests.py +++ b/agbenchmark/challenges/code/d3/custom_python/api_tests.py @@ -5,6 +5,15 @@ from unittest.mock import Mock, patch import requests +def test_make_request_and_assert() -> None: + result = make_request_and_assert() + print(result) + expected_result = {"status": "OK"} + error_message = f"AssertionError: Expected the output to be {expected_result}" + print(error_message) + assert result == expected_result, error_message + + def make_assertion() -> None: if os.environ.get("MOCK_TEST", "False").lower() == "true": mock_response = Mock(requests.Response) @@ -25,3 +34,8 @@ def make_request_and_assert() -> Dict[str, Any]: ) return response.json() + + +if __name__ == "__main__": + # test for the case when server is healthy + test_make_request_and_assert() diff --git a/agbenchmark/challenges/code/d3/data.json b/agbenchmark/challenges/code/d3/data.json index 94c81664c..ae0e45581 100644 --- a/agbenchmark/challenges/code/d3/data.json +++ b/agbenchmark/challenges/code/d3/data.json @@ -6,9 +6,9 @@ "ground": { "answer": "GET localhost:8079/health responds with a 200 OK", "should_contain": [], - "should_not_contain": [], - "files": [], - "type": "custom_python" + "should_not_contain": ["AssertionError"], + "files": ["test.py"], + "type": "execute_python_code" }, "info": { "difficulty": "advanced", diff --git a/agbenchmark/challenges/code/d4/custom_python/test.py b/agbenchmark/challenges/code/d4/custom_python/test.py new file mode 100644 index 000000000..d85d13537 --- /dev/null +++ b/agbenchmark/challenges/code/d4/custom_python/test.py @@ -0,0 +1,31 @@ +# mypy: ignore-errors +from code import two_sum +from typing import List + + +def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: + result = two_sum(nums, target) + print(result) + assert ( + result == expected_result + ), f"AssertionError: Expected the output to be {expected_result}" + + +if __name__ == "__main__": + # test the trivial case with the first two numbers + nums = [2, 7, 11, 15] + target = 9 + expected_result = [0, 1] + test_two_sum(nums, target, expected_result) + + # test for ability to use zero and the same number twice + nums = [2, 7, 0, 15, 12, 0] + target = 0 + expected_result = [2, 5] + test_two_sum(nums, target, expected_result) + + # test for first and last index usage and negative numbers + nums = [-6, 7, 11, 4] + target = -2 + expected_result = [0, 3] + test_two_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d4/hidden_files/test.py b/agbenchmark/challenges/code/d4/hidden_files/test.py deleted file mode 100644 index d85d13537..000000000 --- a/agbenchmark/challenges/code/d4/hidden_files/test.py +++ /dev/null @@ -1,31 +0,0 @@ -# mypy: ignore-errors -from code import two_sum -from typing import List - - -def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: - result = two_sum(nums, target) - print(result) - assert ( - result == expected_result - ), f"AssertionError: Expected the output to be {expected_result}" - - -if __name__ == "__main__": - # test the trivial case with the first two numbers - nums = [2, 7, 11, 15] - target = 9 - expected_result = [0, 1] - test_two_sum(nums, target, expected_result) - - # test for ability to use zero and the same number twice - nums = [2, 7, 0, 15, 12, 0] - target = 0 - expected_result = [2, 5] - 
test_two_sum(nums, target, expected_result) - - # test for first and last index usage and negative numbers - nums = [-6, 7, 11, 4] - target = -2 - expected_result = [0, 3] - test_two_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/test_all.py b/agbenchmark/challenges/test_all.py index a5afef96c..98a5ab81a 100644 --- a/agbenchmark/challenges/test_all.py +++ b/agbenchmark/challenges/test_all.py @@ -2,8 +2,6 @@ import glob import importlib import json import os -import pkgutil -import sys import types from pathlib import Path from typing import Any, Dict @@ -60,23 +58,7 @@ def generate_tests() -> None: scores = self.get_scores(config) - # Check if make_assertion is defined and use it - if self.data.ground.type == "custom_python": - custom_python_location = ( - f"{CURRENT_DIRECTORY}/../{challenge_location}/custom_python" - ) - sys.path.append(str(custom_python_location)) - - for module_loader, name, ispkg in pkgutil.iter_modules( - [str(custom_python_location)] - ): - module = importlib.import_module(name) - - if hasattr(module, "make_assertion"): - make_assertion = getattr(module, "make_assertion") - make_assertion() - else: - assert 1 in scores + assert 1 in scores # Parametrize the method here test_method = pytest.mark.parametrize( -- cgit v1.2.3 From 281cb0ef37c3b8934af787f6681858b0c472556b Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Fri, 14 Jul 2023 14:56:56 -0700 Subject: Start showing benchmark results (#100) --- README.md | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index ed348b5ab..e73f39891 100644 --- a/README.md +++ b/README.md @@ -2,13 +2,26 @@ A repo built for the purpose of benchmarking the performance of agents far and wide, regardless of how they are set up and how they work -### Scores: +## Scores: +Spider chart for each agent coming soon ! -Scoring of agents will go here. Both overall and by category. +## Detailed results +:warning: These results are constantly evolving at the moment. We will publish an official benchmark result very soon. -### Integrated Agents +### Auto-GPT +Coming Soon! -- Auto-GPT -- gpt-engineer -- mini-agi -- smol-developer +### gpt-engineer + +| Task | Results | +|-----------------------------------|----------------------| +| Debug Simple Typo With Guidance | :x: | +| Debug Simple Typo Without Guidance| :x: | +| Basic Code Generation | :white_check_mark: | +| Create Simple Web Server | :x: | + +### mini-agi +Coming Soon! + +### smol-developer +Coming Soon! -- cgit v1.2.3 From c821b294c619c604099b7a9497cc967d9e65ca29 Mon Sep 17 00:00:00 2001 From: ido777 Date: Sat, 15 Jul 2023 01:23:59 +0300 Subject: Fix orjson encoding text with UTF-8 surrogates (#3666) * added lib ftfy (fixes text for you), to solve surrogates errors --------- Co-authored-by: Reinier van der Leer --- autogpt/memory/vector/memory_item.py | 4 ++++ requirements.txt | 1 + 2 files changed, 5 insertions(+) diff --git a/autogpt/memory/vector/memory_item.py b/autogpt/memory/vector/memory_item.py index 587a915b4..f7a7fe6e8 100644 --- a/autogpt/memory/vector/memory_item.py +++ b/autogpt/memory/vector/memory_item.py @@ -4,6 +4,7 @@ import dataclasses import json from typing import Literal +import ftfy import numpy as np from autogpt.config import Config @@ -43,6 +44,9 @@ class MemoryItem: ): logger.debug(f"Memorizing text:\n{'-'*32}\n{text}\n{'-'*32}\n") + # Fix encoding, e.g. 
removing unicode surrogates (see issue #778) + text = ftfy.fix_text(text) + chunks = [ chunk for chunk, _ in ( diff --git a/requirements.txt b/requirements.txt index 47aa08a69..4af8bccd9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,6 +19,7 @@ google-api-python-client #(https://developers.google.com/custom-search/v1/overvi pinecone-client==2.2.1 redis orjson==3.8.10 +ftfy>=6.1.1 Pillow selenium==4.1.4 webdriver-manager -- cgit v1.2.3 From 7de965ab3fa77c724458512053993ce16c3d600f Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Fri, 14 Jul 2023 18:04:35 -0700 Subject: Show Auto-GPT results (#102) --- README.md | 36 +++++++++++++++++++++++++++++++++++- agent/Auto-GPT | 2 +- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e73f39891..fa06317c5 100644 --- a/README.md +++ b/README.md @@ -9,10 +9,44 @@ Spider chart for each agent coming soon ! :warning: These results are constantly evolving at the moment. We will publish an official benchmark result very soon. ### Auto-GPT -Coming Soon! +Interface + +| Task | Results | +|--------------|---------------------| +| Write File | :white_check_mark: | +| Read File | :white_check_mark: | +| Search File | :x: | + +Code + +| Task | Results | +|-----------------------------------|----------------------| +| Debug Simple Typo With Guidance | :x: | +| Debug Simple Typo Without Guidance| :x: | +| Basic Code Generation | :white_check_mark: | +| Create Simple Web Server | :x: | + +Memory + +| Task | Results | +|--------------------------------------------|--------------------| +| Basic Memory | :white_check_mark: | +| Remember Multiple Ids | :x: | +| Remember Multiple Ids With Noise | :x: | +| Remember Multiple Phrases With Noise | :x: | ### gpt-engineer +Interface + +| Task | Results | +|-------------|--------------------| +| Write File | :white_check_mark: | +| Read File | :white_check_mark: | +| Search File | :x: | + +Code + | Task | Results | |-----------------------------------|----------------------| | Debug Simple Typo With Guidance | :x: | diff --git a/agent/Auto-GPT b/agent/Auto-GPT index d4fc134f8..9079f6641 160000 --- a/agent/Auto-GPT +++ b/agent/Auto-GPT @@ -1 +1 @@ -Subproject commit d4fc134f8c4bd7b63f283f932f68932317f53f78 +Subproject commit 9079f66417f2480d0f5764fb0f916d3241b3fae8 -- cgit v1.2.3 From 5ae044f53db4af1b8a54ef8c7e2afb17e67568b9 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Sat, 15 Jul 2023 09:10:32 +0800 Subject: Integrate `plugin.handle_text_embedding` hook (#2804) * add feature custom text embedding in plugin * black code format * _get_embedding_with_plugin() * Fix docstring & type hint --------- Co-authored-by: Reinier van der Leer --- autogpt/memory/vector/utils.py | 22 ++++++++++++++++++++-- autogpt/models/base_open_ai_plugin.py | 12 +++++++----- tests/unit/models/test_base_open_api_plugin.py | 2 ++ 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/autogpt/memory/vector/utils.py b/autogpt/memory/vector/utils.py index eb6912566..1b050d562 100644 --- a/autogpt/memory/vector/utils.py +++ b/autogpt/memory/vector/utils.py @@ -1,3 +1,4 @@ +from contextlib import suppress from typing import Any, overload import numpy as np @@ -12,12 +13,12 @@ Embedding = list[np.float32] | np.ndarray[Any, np.dtype[np.float32]] @overload -def get_embedding(input: str | TText) -> Embedding: +def get_embedding(input: str | TText, config: Config) -> Embedding: ... 
@overload -def get_embedding(input: list[str] | list[TText]) -> list[Embedding]: +def get_embedding(input: list[str] | list[TText], config: Config) -> list[Embedding]: ... @@ -37,9 +38,16 @@ def get_embedding( if isinstance(input, str): input = input.replace("\n", " ") + + with suppress(NotImplementedError): + return _get_embedding_with_plugin(input, config) + elif multiple and isinstance(input[0], str): input = [text.replace("\n", " ") for text in input] + with suppress(NotImplementedError): + return [_get_embedding_with_plugin(i, config) for i in input] + model = config.embedding_model kwargs = {"model": model} kwargs.update(config.get_openai_credentials(model)) @@ -62,3 +70,13 @@ def get_embedding( embeddings = sorted(embeddings, key=lambda x: x["index"]) return [d["embedding"] for d in embeddings] + + +def _get_embedding_with_plugin(text: str, config: Config) -> Embedding: + for plugin in config.plugins: + if plugin.can_handle_text_embedding(text): + embedding = plugin.handle_text_embedding(text) + if embedding is not None: + return embedding + + raise NotImplementedError diff --git a/autogpt/models/base_open_ai_plugin.py b/autogpt/models/base_open_ai_plugin.py index c0aac8ed2..60f6f91bf 100644 --- a/autogpt/models/base_open_ai_plugin.py +++ b/autogpt/models/base_open_ai_plugin.py @@ -198,18 +198,20 @@ class BaseOpenAIPlugin(AutoGPTPluginTemplate): def can_handle_text_embedding(self, text: str) -> bool: """This method is called to check that the plugin can handle the text_embedding method. + Args: text (str): The text to be convert to embedding. - Returns: - bool: True if the plugin can handle the text_embedding method.""" + Returns: + bool: True if the plugin can handle the text_embedding method.""" return False - def handle_text_embedding(self, text: str) -> list: - """This method is called when the chat completion is done. + def handle_text_embedding(self, text: str) -> list[float]: + """This method is called to create a text embedding. + Args: text (str): The text to be convert to embedding. Returns: - list: The text embedding. + list[float]: The created embedding vector. """ def can_handle_user_input(self, user_input: str) -> bool: diff --git a/tests/unit/models/test_base_open_api_plugin.py b/tests/unit/models/test_base_open_api_plugin.py index 4d41eddd3..e656f4643 100644 --- a/tests/unit/models/test_base_open_api_plugin.py +++ b/tests/unit/models/test_base_open_api_plugin.py @@ -54,6 +54,7 @@ def test_dummy_plugin_default_methods(dummy_plugin): assert not dummy_plugin.can_handle_pre_command() assert not dummy_plugin.can_handle_post_command() assert not dummy_plugin.can_handle_chat_completion(None, None, None, None) + assert not dummy_plugin.can_handle_text_embedding(None) assert dummy_plugin.on_response("hello") == "hello" assert dummy_plugin.post_prompt(None) is None @@ -77,3 +78,4 @@ def test_dummy_plugin_default_methods(dummy_plugin): assert isinstance(post_command, str) assert post_command == "upgraded successfully!" 
assert dummy_plugin.handle_chat_completion(None, None, None, None) is None + assert dummy_plugin.handle_text_embedding(None) is None -- cgit v1.2.3 From 66fc7ccb31e88432abf1845a439210a36dd232cd Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Fri, 14 Jul 2023 18:26:17 -0700 Subject: Display smol-developer-results (#103) --- README.md | 21 ++++++++++++++++++--- agent/smol-developer | 2 +- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index fa06317c5..727fefa41 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ A repo built for the purpose of benchmarking the performance of agents far and wide, regardless of how they are set up and how they work ## Scores: -Spider chart for each agent coming soon ! +Radio chart for each agent coming soon ! ## Detailed results :warning: These results are constantly evolving at the moment. We will publish an official benchmark result very soon. @@ -42,7 +42,7 @@ Interface | Task | Results | |-------------|--------------------| | Write File | :white_check_mark: | -| Read File | :white_check_mark: | +| Read File | :x: | | Search File | :x: | Code @@ -58,4 +58,19 @@ Code Coming Soon! ### smol-developer -Coming Soon! +Interface + +| Task | Results | +|-------------|--------------------| +| Write File | :white_check_mark: | +| Read File | :x: | +| Search File | :x: | + +Code + +| Task | Results | +|-----------------------------------|----------------------| +| Debug Simple Typo With Guidance | :x: | +| Debug Simple Typo Without Guidance| :x: | +| Basic Code Generation | :white_check_mark: | +| Create Simple Web Server | :x: | diff --git a/agent/smol-developer b/agent/smol-developer index aa8233925..f4f439551 160000 --- a/agent/smol-developer +++ b/agent/smol-developer @@ -1 +1 @@ -Subproject commit aa8233925090c0c9314ceef68397ab37baf17766 +Subproject commit f4f4395511ed6ba59ec09100d6596bf81d68a898 -- cgit v1.2.3 From 8be2a0b2e13972ed042485f3eca551b794434881 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Fri, 14 Jul 2023 18:45:24 -0700 Subject: Display results per category (#104) --- README.md | 67 +++++++++++++-------------------------------------------------- 1 file changed, 14 insertions(+), 53 deletions(-) diff --git a/README.md b/README.md index 727fefa41..3011adedc 100644 --- a/README.md +++ b/README.md @@ -8,69 +8,30 @@ Radio chart for each agent coming soon ! ## Detailed results :warning: These results are constantly evolving at the moment. We will publish an official benchmark result very soon. 
-### Auto-GPT Interface -| Task | Results | -|--------------|---------------------| -| Write File | :white_check_mark: | -| Read File | :white_check_mark: | -| Search File | :x: | +| Task | Auto-GPT | gpt-engineer | mini-agi | smol-developer | +|--------------|--------------------|--------------------|----------|--------------------| +| Write File | :white_check_mark: | :white_check_mark: | tbd | :white_check_mark: | +| Read File | :white_check_mark: | :x: | tbd | :x: | +| Search File | :x: | :x: | tbd | :x: | + Code -| Task | Results | -|-----------------------------------|----------------------| -| Debug Simple Typo With Guidance | :x: | -| Debug Simple Typo Without Guidance| :x: | -| Basic Code Generation | :white_check_mark: | -| Create Simple Web Server | :x: | +| Task | Auto-GPT | gpt-engineer | mini-agi | smol-developer | +|------------------------------------|--------------------|--------------------|----------|--------------------| +| Debug Simple Typo With Guidance | :x: | :x: | tbd | :x: | +| Debug Simple Typo Without Guidance | :x: | :x: | tbd | :x: | +| Basic Code Generation | :white_check_mark: | :white_check_mark: | tbd | :white_check_mark: | +| Create Simple Web Server | :x: | :x: | tbd | :x: | + Memory -| Task | Results | +| Task | Auto-GPT | |--------------------------------------------|--------------------| | Basic Memory | :white_check_mark: | | Remember Multiple Ids | :x: | | Remember Multiple Ids With Noise | :x: | | Remember Multiple Phrases With Noise | :x: | - -### gpt-engineer - -Interface - -| Task | Results | -|-------------|--------------------| -| Write File | :white_check_mark: | -| Read File | :x: | -| Search File | :x: | - -Code - -| Task | Results | -|-----------------------------------|----------------------| -| Debug Simple Typo With Guidance | :x: | -| Debug Simple Typo Without Guidance| :x: | -| Basic Code Generation | :white_check_mark: | -| Create Simple Web Server | :x: | - -### mini-agi -Coming Soon! - -### smol-developer -Interface - -| Task | Results | -|-------------|--------------------| -| Write File | :white_check_mark: | -| Read File | :x: | -| Search File | :x: | - -Code - -| Task | Results | -|-----------------------------------|----------------------| -| Debug Simple Typo With Guidance | :x: | -| Debug Simple Typo Without Guidance| :x: | -| Basic Code Generation | :white_check_mark: | -| Create Simple Web Server | :x: | -- cgit v1.2.3 From bb654734167927b2d1e8673b6de13797dbad8dd6 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Sat, 15 Jul 2023 08:57:28 -0700 Subject: Update Auto-GPT to current version of master (#105) --- agent/Auto-GPT | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agent/Auto-GPT b/agent/Auto-GPT index 9079f6641..357a918ec 160000 --- a/agent/Auto-GPT +++ b/agent/Auto-GPT @@ -1 +1 @@ -Subproject commit 9079f66417f2480d0f5764fb0f916d3241b3fae8 +Subproject commit 357a918ecc9936207c70cf363bb95d74ec510e84 -- cgit v1.2.3 From dab4e90e157d65d5257880f1d818cd97a1b77030 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Sat, 15 Jul 2023 09:53:56 -0700 Subject: Update Auto-GPT score (#106) Signed-off-by: Merwane Hamadi --- README.md | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 3011adedc..368c79ee4 100644 --- a/README.md +++ b/README.md @@ -10,28 +10,28 @@ Radio chart for each agent coming soon ! 
Interface -| Task | Auto-GPT | gpt-engineer | mini-agi | smol-developer | -|--------------|--------------------|--------------------|----------|--------------------| -| Write File | :white_check_mark: | :white_check_mark: | tbd | :white_check_mark: | -| Read File | :white_check_mark: | :x: | tbd | :x: | -| Search File | :x: | :x: | tbd | :x: | +| Task | Auto-GPT | gpt-engineer | mini-agi | smol-developer | +|--------------|----------|--------------------|----------|--------------------| +| Write File | :x: | :white_check_mark: | tbd | :white_check_mark: | +| Read File | :x: | :x: | tbd | :x: | +| Search File | :x: | :x: | tbd | :x: | Code -| Task | Auto-GPT | gpt-engineer | mini-agi | smol-developer | -|------------------------------------|--------------------|--------------------|----------|--------------------| -| Debug Simple Typo With Guidance | :x: | :x: | tbd | :x: | -| Debug Simple Typo Without Guidance | :x: | :x: | tbd | :x: | -| Basic Code Generation | :white_check_mark: | :white_check_mark: | tbd | :white_check_mark: | -| Create Simple Web Server | :x: | :x: | tbd | :x: | +| Task | Auto-GPT | gpt-engineer | mini-agi | smol-developer | +|------------------------------------|----------|--------------------|----------|--------------------| +| Debug Simple Typo With Guidance | :x: | :x: | tbd | :x: | +| Debug Simple Typo Without Guidance | :x: | :x: | tbd | :x: | +| Basic Code Generation | :x: | :white_check_mark: | tbd | :white_check_mark: | +| Create Simple Web Server | :x: | :x: | tbd | :x: | Memory -| Task | Auto-GPT | -|--------------------------------------------|--------------------| -| Basic Memory | :white_check_mark: | -| Remember Multiple Ids | :x: | -| Remember Multiple Ids With Noise | :x: | -| Remember Multiple Phrases With Noise | :x: | +| Task | Auto-GPT | +|--------------------------------------------|----------| +| Basic Memory | :x: | +| Remember Multiple Ids | :x: | +| Remember Multiple Ids With Noise | :x: | +| Remember Multiple Phrases With Noise | :x: | -- cgit v1.2.3 From cbd2e49d973a344e9fce1e55e4ed4bf7e9c26e57 Mon Sep 17 00:00:00 2001 From: Erik Peterson Date: Sat, 15 Jul 2023 16:23:49 -0700 Subject: Clean up workspace between each test (#109) --- agbenchmark/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 32151b8ad..952588105 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -53,7 +53,7 @@ def config(request: Any) -> None: return config -@pytest.fixture(scope="module", autouse=True) +@pytest.fixture(autouse=True) def workspace(config: Dict[str, Any]) -> Generator[str, None, None]: output_path = config["workspace"] -- cgit v1.2.3 From 5886d7505914a53ad47f0e41087581e187178ae6 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Sat, 15 Jul 2023 16:52:42 -0700 Subject: Add three sum challenge (#108) Co-authored-by: Silen Naihin --- .github/workflows/ci.yml | 9 ++++--- agbenchmark/challenges/code/d1/data.json | 2 +- agbenchmark/challenges/code/d2/data.json | 2 +- agbenchmark/challenges/code/d4/data.json | 2 +- .../challenges/code/d5/artifacts_out/__init__.py | 0 .../challenges/code/d5/artifacts_out/code.py | 23 ++++++++++++++++ .../challenges/code/d5/custom_python/test.py | 31 ++++++++++++++++++++++ agbenchmark/challenges/code/d5/data.json | 18 +++++++++++++ agent/gpt-engineer | 2 +- 9 files changed, 82 insertions(+), 7 deletions(-) create mode 100644 agbenchmark/challenges/code/d5/artifacts_out/__init__.py create mode 100644 
agbenchmark/challenges/code/d5/artifacts_out/code.py create mode 100644 agbenchmark/challenges/code/d5/custom_python/test.py create mode 100644 agbenchmark/challenges/code/d5/data.json diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b7864db6a..dbb0a6ace 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -64,7 +64,7 @@ jobs: if: success() || failure() tests: - name: ${{ matrix.agent-name }} + name: "${{ matrix.agent-name }} (Cache: ${{ matrix.cache-enabled }})" runs-on: ubuntu-latest timeout-minutes: 10 env: @@ -77,6 +77,8 @@ jobs: - "smol-developer" - "Auto-GPT" - "mini-agi" + cache-enabled: [ true, false ] + steps: - name: Checkout repository uses: actions/checkout@v3 @@ -156,7 +158,7 @@ jobs: PROMPT_USER: false # For mini-agi. TODO: Remove this once mini-agi follows the standards. HELICONE_API_KEY: ${{ secrets.HELICONE_API_KEY }} REQUESTS_CA_BUNDLE: /etc/ssl/certs/ca-certificates.crt - HELICONE_CACHE_ENABLED: true + HELICONE_CACHE_ENABLED: ${{ matrix.cache-enabled }} HELICONE_PROPERTY_AGENT: ${{ matrix.agent-name }} @@ -164,5 +166,6 @@ jobs: if: always() uses: actions/upload-artifact@v3 with: - name: ${{ matrix.agent-name }} + name: + ${{ matrix.agent-name }} (Cache: ${{ matrix.cache-enabled }}) path: agent/${{ matrix.agent-name }}/agbenchmark diff --git a/agbenchmark/challenges/code/d1/data.json b/agbenchmark/challenges/code/d1/data.json index 061c924f5..bc1a15b42 100644 --- a/agbenchmark/challenges/code/d1/data.json +++ b/agbenchmark/challenges/code/d1/data.json @@ -1,6 +1,6 @@ { "name": "TestDebugSimpleTypoWithGuidance", - "category": ["code"], + "category": ["code", "iterate"], "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", "dependencies": ["TestReadFile", "TestWriteFile"], "ground": { diff --git a/agbenchmark/challenges/code/d2/data.json b/agbenchmark/challenges/code/d2/data.json index 6523ef1d8..fca86f29b 100644 --- a/agbenchmark/challenges/code/d2/data.json +++ b/agbenchmark/challenges/code/d2/data.json @@ -1,6 +1,6 @@ { "name": "TestDebugSimpleTypoWithoutGuidance", - "category": ["code"], + "category": ["code", "iterate"], "task": "Make test.py run without errors.", "dependencies": ["TestDebugSimpleTypoWithGuidance"], "ground": { diff --git a/agbenchmark/challenges/code/d4/data.json b/agbenchmark/challenges/code/d4/data.json index 51f6f2702..b2320a4e5 100644 --- a/agbenchmark/challenges/code/d4/data.json +++ b/agbenchmark/challenges/code/d4/data.json @@ -1,6 +1,6 @@ { "name": "TestBasicCodeGeneration", - "category": ["code", "iterate"], + "category": ["code"], "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", "dependencies": ["TestWriteFile"], "ground": { diff --git a/agbenchmark/challenges/code/d5/artifacts_out/__init__.py b/agbenchmark/challenges/code/d5/artifacts_out/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/challenges/code/d5/artifacts_out/code.py b/agbenchmark/challenges/code/d5/artifacts_out/code.py new file mode 100644 index 000000000..6056691da --- /dev/null +++ b/agbenchmark/challenges/code/d5/artifacts_out/code.py @@ -0,0 +1,23 @@ +# mypy: ignore-errors +from typing import List, Optional + + +def three_sum(nums: List[int], target: int) -> Optional[List[int]]: + nums_indices = [(num, index) for index, num in enumerate(nums)] + nums_indices.sort() + for i in range(len(nums_indices) - 2): + if i > 0 and nums_indices[i] == nums_indices[i - 1]: + continue + l, r = i + 1, len(nums_indices) - 1 + while l < r: + three_sum = nums_indices[i][0] + nums_indices[l][0] + nums_indices[r][0] + if three_sum < target: + l += 1 + elif three_sum > target: + r -= 1 + else: + indices = sorted( + [nums_indices[i][1], nums_indices[l][1], nums_indices[r][1]] + ) + return indices + return None diff --git a/agbenchmark/challenges/code/d5/custom_python/test.py b/agbenchmark/challenges/code/d5/custom_python/test.py new file mode 100644 index 000000000..761b9f5c6 --- /dev/null +++ b/agbenchmark/challenges/code/d5/custom_python/test.py @@ -0,0 +1,31 @@ +# mypy: ignore-errors +from code import three_sum +from typing import List + + +def test_three_sum(nums: List[int], target: int, expected_result: List[int]) -> None: + result = three_sum(nums, target) + print(result) + assert ( + result == expected_result + ), f"AssertionError: Expected the output to be {expected_result}" + + +if __name__ == "__main__": + # test the trivial case with the first three numbers + nums = [2, 7, 11, 15] + target = 20 + expected_result = [0, 1, 2] + test_three_sum(nums, target, expected_result) + + # test for ability to use zero and the same number twice + nums = [2, 7, 0, 15, 12, 0] + target = 2 + expected_result = [0, 2, 5] + test_three_sum(nums, target, expected_result) + + # test for first and last index usage and negative numbers + nums = [-6, 7, 11, 4] + target = 9 + expected_result = [0, 2, 3] + test_three_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d5/data.json b/agbenchmark/challenges/code/d5/data.json new file mode 100644 index 000000000..4b44c6943 --- /dev/null +++ b/agbenchmark/challenges/code/d5/data.json @@ -0,0 +1,18 @@ +{ + "name": "TestThreeSum", + "category": ["code", "iterate"], + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "dependencies": ["TestWriteFile", "TestBasicCodeGeneration"], + "ground": { + "answer": "The three_sum function coded properly.", + "should_contain": ["[0, 1, 2]", "[0, 2, 5]", "[0, 2, 3]"], + "should_not_contain": [], + "files": ["test.py"], + "type": "execute_python_code" + }, + "info": { + "difficulty": "intermediate", + "description": "Tests ability for the agent to create the three_sum function.", + "side_effects": [] + } +} diff --git a/agent/gpt-engineer b/agent/gpt-engineer index bca191cd7..f0c76918d 160000 --- a/agent/gpt-engineer +++ b/agent/gpt-engineer @@ -1 +1 @@ -Subproject commit bca191cd76cdea0335da91d004c64d9bb8520fea +Subproject commit f0c76918dff7a6cf5e0611a09b060fc5d4913b82 -- cgit v1.2.3 From 02dce4193780ba6d4c0225b3c21da16ecca51ab4 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Sat, 15 Jul 2023 18:00:37 -0700 Subject: Fix ci (#110) --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index dbb0a6ace..7f6959807 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -149,7 +149,7 @@ jobs: agbenchmark start --mock --category=iterate else curl -s https://raw.githubusercontent.com/Helicone/helicone/main/mitmproxy.sh | bash -s start - agbenchmark start --maintain + agbenchmark start | echo "This command will always return a non zero exit code unless all the challenges are solved." fi env: GITHUB_EVENT_NAME: ${{ github.event_name }} @@ -167,5 +167,5 @@ jobs: uses: actions/upload-artifact@v3 with: name: - ${{ matrix.agent-name }} (Cache: ${{ matrix.cache-enabled }}) + "${{ matrix.agent-name }} (Cache ${{ matrix.cache-enabled }})" path: agent/${{ matrix.agent-name }}/agbenchmark -- cgit v1.2.3 From 757baba3ff61f354359720667e136e40a54ae7f0 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Sat, 15 Jul 2023 18:09:29 -0700 Subject: Remove cache true on pr (#111) Signed-off-by: Merwane Hamadi --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7f6959807..907c21267 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -64,6 +64,7 @@ jobs: if: success() || failure() tests: + if: github.event_name != 'pull_request' || matrix.cache-enabled == false name: "${{ matrix.agent-name }} (Cache: ${{ matrix.cache-enabled }})" runs-on: ubuntu-latest timeout-minutes: 10 -- cgit v1.2.3 From 9f3a2d4f05702bf44b0b938582c5dd6f9a459ea2 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 15 Jul 2023 22:10:20 -0400 Subject: Dynamic cutoff and other quality of life (#101) --- .gitmodules | 2 +- agbenchmark/agent_interface.py | 12 +- agbenchmark/challenge.py | 11 +- agbenchmark/challenges/code/d1/data.json | 1 + agbenchmark/challenges/code/d2/data.json | 1 + agbenchmark/challenges/code/d3/data.json | 1 + agbenchmark/challenges/code/d4/data.json | 1 + agbenchmark/challenges/code/d5/data.json | 1 + agbenchmark/challenges/define_task_types.py | 1 + .../challenges/interface/read_file/data.json | 1 + agbenchmark/challenges/interface/search/data.json | 1 + .../challenges/interface/write_file/data.json | 1 + agbenchmark/challenges/memory/m1/data.json | 1 + agbenchmark/challenges/memory/m2/data.json | 1 + agbenchmark/challenges/memory/m3/data.json | 1 + agbenchmark/challenges/memory/m4/data.json | 1 + agbenchmark/challenges/retrieval/r1/data.json | 1 + 
agbenchmark/challenges/retrieval/r2/data.json | 1 + agbenchmark/challenges/retrieval/r3/data.json | 1 + agbenchmark/challenges/test_all.py | 3 +- agbenchmark/config.json | 3 +- agbenchmark/conftest.py | 33 ++--- agbenchmark/internal_info.json | 8 +- agbenchmark/regression_tests.json | 19 +-- agbenchmark/reports/1.json | 148 --------------------- agbenchmark/reports/file1_07-14-18-54.json | 147 ++++++++++++++++++++ agbenchmark/start_benchmark.py | 2 +- agbenchmark/utils.py | 15 ++- agent/Auto-GPT | 2 +- agent/SuperAGI | 2 +- agent/config_example.json | 3 +- agent/gpt-engineer | 2 +- agent/mini-agi | 2 +- agent/smol-developer | 2 +- 34 files changed, 221 insertions(+), 211 deletions(-) delete mode 100644 agbenchmark/reports/1.json create mode 100644 agbenchmark/reports/file1_07-14-18-54.json diff --git a/.gitmodules b/.gitmodules index f14b5e07d..d2b71f9c4 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ [submodule "agent/Auto-GPT"] path = agent/Auto-GPT - url = https://github.com/Significant-Gravitas/Auto-GPT.git + url = https://github.com/merwanehamadi/Auto-GPT.git branch = benchmark-integration [submodule "agent/gpt-engineer"] path = agent/gpt-engineer diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index 991a7e8e0..897f4f8cf 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -16,9 +16,7 @@ MOCK_FLAG = mock_test_str.lower() == "true" if mock_test_str else False def run_agent( - task: str, - config: Dict[str, Any], - challenge_location: str, + task: str, config: Dict[str, Any], challenge_location: str, cutoff: int ) -> None: """Calling to get a response""" @@ -27,9 +25,7 @@ def run_agent( config["workspace"], "artifacts_out", challenge_location ) else: - print( - f"Running Python function '{config['entry_path']}' with timeout {config['cutoff']}" - ) + print(f"Running Python function '{config['entry_path']}' with timeout {cutoff}") command = [sys.executable, "-m", config["entry_path"], str(task)] process = subprocess.Popen( command, @@ -50,11 +46,11 @@ def run_agent( if ( process.poll() is not None or output == "" - or (time.time() - start_time > config["cutoff"]) + or (time.time() - start_time > cutoff) ): break - if time.time() - start_time > config["cutoff"]: + if time.time() - start_time > cutoff: print("The Python function has exceeded the time limit and was terminated.") process.kill() else: diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py index f07faf8ee..4f24bb603 100644 --- a/agbenchmark/challenge.py +++ b/agbenchmark/challenge.py @@ -4,16 +4,9 @@ import subprocess from abc import ABC from typing import Any, Dict, List -from dotenv import load_dotenv - from agbenchmark.challenges.define_task_types import ChallengeData, Ground from agbenchmark.start_benchmark import CURRENT_DIRECTORY -load_dotenv() - -mock_test_str = os.getenv("MOCK_TEST") -MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str else False - class Challenge(ABC): """The parent class to all specific challenges classes. 
@@ -37,14 +30,14 @@ class Challenge(ABC): def dependencies(self) -> list: return self.data.dependencies - def setup_challenge(self, config: Dict[str, Any]) -> None: + def setup_challenge(self, config: Dict[str, Any], cutoff: int) -> None: from agbenchmark.agent_interface import copy_artifacts_into_workspace, run_agent copy_artifacts_into_workspace( config["workspace"], "artifacts_in", self.CHALLENGE_LOCATION ) - run_agent(self.task, config, self.CHALLENGE_LOCATION) + run_agent(self.task, config, self.CHALLENGE_LOCATION, cutoff) # hidden files are added after the agent runs. Hidden files can be python test files. # We copy them in the workspace to make it easy to import the code produced by the agent diff --git a/agbenchmark/challenges/code/d1/data.json b/agbenchmark/challenges/code/d1/data.json index bc1a15b42..d8e0280a4 100644 --- a/agbenchmark/challenges/code/d1/data.json +++ b/agbenchmark/challenges/code/d1/data.json @@ -3,6 +3,7 @@ "category": ["code", "iterate"], "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", "dependencies": ["TestReadFile", "TestWriteFile"], + "cutoff": 60, "ground": { "answer": "[0, 1] [2, 5] [0, 3]", "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], diff --git a/agbenchmark/challenges/code/d2/data.json b/agbenchmark/challenges/code/d2/data.json index fca86f29b..de32ef9a7 100644 --- a/agbenchmark/challenges/code/d2/data.json +++ b/agbenchmark/challenges/code/d2/data.json @@ -3,6 +3,7 @@ "category": ["code", "iterate"], "task": "Make test.py run without errors.", "dependencies": ["TestDebugSimpleTypoWithGuidance"], + "cutoff": 60, "ground": { "answer": "[0, 1] [2, 5] [0, 3]", "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], diff --git a/agbenchmark/challenges/code/d3/data.json b/agbenchmark/challenges/code/d3/data.json index ae0e45581..c5d111a4d 100644 --- a/agbenchmark/challenges/code/d3/data.json +++ b/agbenchmark/challenges/code/d3/data.json @@ -3,6 +3,7 @@ "category": ["code"], "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", "dependencies": ["TestDebugSimpleTypoWithGuidance"], + "cutoff": 60, "ground": { "answer": "GET localhost:8079/health responds with a 200 OK", "should_contain": [], diff --git a/agbenchmark/challenges/code/d4/data.json b/agbenchmark/challenges/code/d4/data.json index b2320a4e5..e8db918d2 100644 --- a/agbenchmark/challenges/code/d4/data.json +++ b/agbenchmark/challenges/code/d4/data.json @@ -3,6 +3,7 @@ "category": ["code"], "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", "dependencies": ["TestWriteFile"], + "cutoff": 60, "ground": { "answer": "The two_sum function coded properly.", "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], diff --git a/agbenchmark/challenges/code/d5/data.json b/agbenchmark/challenges/code/d5/data.json index 4b44c6943..434b1312e 100644 --- a/agbenchmark/challenges/code/d5/data.json +++ b/agbenchmark/challenges/code/d5/data.json @@ -3,6 +3,7 @@ "category": ["code", "iterate"], "task": "Create a three_sum function in a file called code.py. 
Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", "dependencies": ["TestWriteFile", "TestBasicCodeGeneration"], + "cutoff": 60, "ground": { "answer": "The three_sum function coded properly.", "should_contain": ["[0, 1, 2]", "[0, 2, 5]", "[0, 2, 3]"], diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index 668025dd2..dc1777d71 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -61,6 +61,7 @@ class ChallengeData(BaseModel): category: List[str] task: str dependencies: List[str] + cutoff: int ground: Ground info: Info diff --git a/agbenchmark/challenges/interface/read_file/data.json b/agbenchmark/challenges/interface/read_file/data.json index 1bf340454..55878856b 100644 --- a/agbenchmark/challenges/interface/read_file/data.json +++ b/agbenchmark/challenges/interface/read_file/data.json @@ -3,6 +3,7 @@ "category": ["interface"], "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", "dependencies": ["TestWriteFile"], + "cutoff": 60, "ground": { "answer": "random string Hello World!", "should_contain": ["random string", "Hello World!"], diff --git a/agbenchmark/challenges/interface/search/data.json b/agbenchmark/challenges/interface/search/data.json index de8934d95..793e71715 100644 --- a/agbenchmark/challenges/interface/search/data.json +++ b/agbenchmark/challenges/interface/search/data.json @@ -3,6 +3,7 @@ "category": ["interface"], "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", "dependencies": ["TestWriteFile"], + "cutoff": 60, "ground": { "answer": "This is a Heading\nThis is a paragraph.", "should_contain": ["Heading", "paragraph"], diff --git a/agbenchmark/challenges/interface/write_file/data.json b/agbenchmark/challenges/interface/write_file/data.json index 8db9cd620..426e6c3d4 100644 --- a/agbenchmark/challenges/interface/write_file/data.json +++ b/agbenchmark/challenges/interface/write_file/data.json @@ -3,6 +3,7 @@ "category": ["interface"], "task": "Print the the capital of America to a .txt file", "dependencies": [], + "cutoff": 60, "ground": { "answer": "Washington", "should_contain": ["Washington"], diff --git a/agbenchmark/challenges/memory/m1/data.json b/agbenchmark/challenges/memory/m1/data.json index ab86f1c3c..33c523244 100644 --- a/agbenchmark/challenges/memory/m1/data.json +++ b/agbenchmark/challenges/memory/m1/data.json @@ -3,6 +3,7 @@ "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", "dependencies": ["TestReadFile", "TestWriteFile"], + "cutoff": 60, "ground": { "answer": "2314", "should_contain": ["2314"], diff --git a/agbenchmark/challenges/memory/m2/data.json b/agbenchmark/challenges/memory/m2/data.json index 9205c99f1..840e8dc83 100644 --- a/agbenchmark/challenges/memory/m2/data.json +++ b/agbenchmark/challenges/memory/m2/data.json @@ -3,6 +3,7 @@ "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", "dependencies": ["TestBasicMemory"], + "cutoff": 60, "ground": { "answer": "3145\n3791\n9317\n9471", "should_contain": ["3145", "3791", "9317", "9471"], diff --git 
a/agbenchmark/challenges/memory/m3/data.json b/agbenchmark/challenges/memory/m3/data.json index 3b78d614b..3af2fb3a5 100644 --- a/agbenchmark/challenges/memory/m3/data.json +++ b/agbenchmark/challenges/memory/m3/data.json @@ -3,6 +3,7 @@ "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", "dependencies": ["TestRememberMultipleIds"], + "cutoff": 60, "ground": { "answer": "3145\n3791\n9317\n9471", "should_contain": ["3145", "3791", "9317", "9471"], diff --git a/agbenchmark/challenges/memory/m4/data.json b/agbenchmark/challenges/memory/m4/data.json index 84f5c2b21..17a078e1b 100644 --- a/agbenchmark/challenges/memory/m4/data.json +++ b/agbenchmark/challenges/memory/m4/data.json @@ -3,6 +3,7 @@ "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", "dependencies": ["TestRememberMultipleIdsWithNoise"], + "cutoff": 60, "ground": { "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyoncé on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", "should_contain": [ diff --git a/agbenchmark/challenges/retrieval/r1/data.json b/agbenchmark/challenges/retrieval/r1/data.json index e3e09302d..c3af4862d 100644 --- a/agbenchmark/challenges/retrieval/r1/data.json +++ b/agbenchmark/challenges/retrieval/r1/data.json @@ -3,6 +3,7 @@ "category": ["retrieval"], "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", "dependencies": ["TestWriteFile", "TestSearch"], + "cutoff": 60, "ground": { "answer": "£25.89", "should_contain": ["25.89"], diff --git a/agbenchmark/challenges/retrieval/r2/data.json b/agbenchmark/challenges/retrieval/r2/data.json index 977be4bcd..f558b8584 100644 --- a/agbenchmark/challenges/retrieval/r2/data.json +++ b/agbenchmark/challenges/retrieval/r2/data.json @@ -3,6 +3,7 @@ "category": ["retrieval"], "task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", "dependencies": ["TestBasicRetrieval"], + "cutoff": 60, "ground": { "answer": "81,462", "should_contain": ["81,462"], diff --git a/agbenchmark/challenges/retrieval/r3/data.json b/agbenchmark/challenges/retrieval/r3/data.json index 5504908ea..eb998ffbf 100644 --- a/agbenchmark/challenges/retrieval/r3/data.json +++ b/agbenchmark/challenges/retrieval/r3/data.json @@ -3,6 +3,7 @@ "category": ["retrieval"], "task": "Write tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", "dependencies": ["TestRetrieval2"], + "cutoff": 60, "ground": { "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", "should_contain": [ diff --git a/agbenchmark/challenges/test_all.py b/agbenchmark/challenges/test_all.py index 98a5ab81a..255b39e57 100644 --- a/agbenchmark/challenges/test_all.py +++ b/agbenchmark/challenges/test_all.py @@ -54,7 +54,8 @@ def generate_tests() -> None: # Define test method within the dynamically created class def test_method(self, config: Dict[str, Any]) -> None: # type: ignore - self.setup_challenge(config) + cutoff = self.data.cutoff or 60 + self.setup_challenge(config, cutoff) scores = self.get_scores(config) diff --git a/agbenchmark/config.json b/agbenchmark/config.json index af83029ef..820f133b1 100644 --- a/agbenchmark/config.json +++ b/agbenchmark/config.json @@ -1,5 +1,4 @@ { "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "agbenchmark.benchmarks", - "cutoff": 60 + "entry_path": "agbenchmark.benchmarks" } diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 952588105..245df485e 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -18,12 +18,10 @@ from agbenchmark.start_benchmark import ( from agbenchmark.utils import calculate_success_percentage -def resolve_workspace(config: Dict[str, Any]) -> str: - if config.get("workspace", "").startswith("${") and config.get( - "workspace", "" - ).endswith("}"): +def resolve_workspace(workspace: str) -> str: + if workspace.startswith("${") and workspace.endswith("}"): # Extract the string inside ${...} - path_expr = config["workspace"][2:-1] + path_expr = workspace[2:-1] # Check if it starts with "os.path.join" if path_expr.strip().startswith("os.path.join"): @@ -35,7 +33,7 @@ def resolve_workspace(config: Dict[str, Any]) -> str: else: raise ValueError("Invalid workspace path expression.") else: - return os.path.abspath(Path(os.getcwd()) / config["workspace"]) + return os.path.abspath(Path(os.getcwd()) / workspace) @pytest.fixture(scope="module") @@ -45,10 +43,10 @@ def config(request: Any) -> None: config = json.load(f) if isinstance(config["workspace"], str): - config["workspace"] = resolve_workspace(config) + config["workspace"] = resolve_workspace(config["workspace"]) else: # it's a input output dict - config["workspace"]["input"] = resolve_workspace(config) - config["workspace"]["output"] = resolve_workspace(config) + config["workspace"]["input"] = resolve_workspace(config["workspace"]["input"]) + config["workspace"]["output"] = resolve_workspace(config["workspace"]["output"]) return config @@ -173,18 +171,21 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None: regression_manager.remove_test(test_name) info_details["metrics"]["fail_reason"] = str(call.excinfo.value) - prev_test_results: list[bool] = [] - + prev_test_results: list[bool] = internal_info.tests.get(test_name, []) if not mock: # only add if it's an actual test - prev_test_results = internal_info.tests.get(test_name, []) prev_test_results.append(info_details["metrics"]["success"]) internal_info.add_test(test_name, prev_test_results) - # can calculate success rate regardless of mock - info_details["metrics"]["success_%"] = calculate_success_percentage( - prev_test_results - ) + # 
can calculate success rate regardless of mock + info_details["metrics"]["success_%"] = calculate_success_percentage( + prev_test_results + ) + else: + # can calculate success rate regardless of mock + info_details["metrics"][ + "non_mock_success_%" + ] = calculate_success_percentage(prev_test_results) if len(prev_test_results) >= 3 and prev_test_results[-3:] == [True, True, True]: # if the last 3 tests were successful, add to the regression tests diff --git a/agbenchmark/internal_info.json b/agbenchmark/internal_info.json index 5f46bd854..95a051d54 100644 --- a/agbenchmark/internal_info.json +++ b/agbenchmark/internal_info.json @@ -62,6 +62,12 @@ "TestWriteFile": [ true, true, - true + true, + false, + false, + false, + false, + true, + false ] } \ No newline at end of file diff --git a/agbenchmark/regression_tests.json b/agbenchmark/regression_tests.json index ce73ce263..25591a4de 100644 --- a/agbenchmark/regression_tests.json +++ b/agbenchmark/regression_tests.json @@ -16,57 +16,52 @@ "data_path": "agbenchmark/challenges/retrieval/r1" }, "TestReadFile": { - "difficulty": "basic", + "difficulty": "interface", "dependencies": [ "TestWriteFile" ], "data_path": "agbenchmark/challenges/interface/read_file" }, "TestRememberMultipleIds": { - "difficulty": "basic", + "difficulty": "novice", "dependencies": [ "TestBasicMemory" ], "data_path": "agbenchmark/challenges/memory/m2" }, "TestRememberMultipleIdsWithNoise": { - "difficulty": "medium", + "difficulty": "intermediate", "dependencies": [ "TestRememberMultipleIds" ], "data_path": "agbenchmark/challenges/memory/m3" }, "TestRememberMultiplePhrasesWithNoise": { - "difficulty": "medium", + "difficulty": "advanced", "dependencies": [ "TestRememberMultipleIdsWithNoise" ], "data_path": "agbenchmark/challenges/memory/m4" }, "TestRetrieval2": { - "difficulty": "basic", + "difficulty": "novice", "dependencies": [ "TestBasicRetrieval" ], "data_path": "agbenchmark/challenges/retrieval/r2" }, "TestRetrieval3": { - "difficulty": "basic", + "difficulty": "intermediate", "dependencies": [ "TestRetrieval2" ], "data_path": "agbenchmark/challenges/retrieval/r3" }, "TestSearch": { - "difficulty": "basic", + "difficulty": "interface", "dependencies": [ "TestWriteFile" ], "data_path": "agbenchmark/challenges/interface/search" - }, - "TestWriteFile": { - "difficulty": "basic", - "dependencies": [], - "data_path": "agbenchmark/challenges/interface/write_file" } } \ No newline at end of file diff --git a/agbenchmark/reports/1.json b/agbenchmark/reports/1.json deleted file mode 100644 index 45945a3ee..000000000 --- a/agbenchmark/reports/1.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "command": "agbenchmark start --mock", - "completion_time": "2023-07-11-21:09", - "metrics": { - "run_time": "0.96 seconds", - "highest_difficulty": "advanced: 5" - }, - "tests": { - "TestWriteFile": { - "data_path": "agbenchmark/challenges/interface/write_file", - "is_regression": false, - "metrics": { - "difficulty": "interface", - "success": true, - "success_%": 0, - "run_time": "0.008 seconds" - } - }, - "TestReadFile": { - "data_path": "agbenchmark/challenges/interface/read_file", - "is_regression": false, - "metrics": { - "difficulty": "interface", - "success": true, - "success_%": 0, - "run_time": "0.005 seconds" - } - }, - "TestSearch": { - "data_path": "agbenchmark/challenges/interface/search", - "is_regression": false, - "metrics": { - "difficulty": "interface", - "success": true, - "success_%": 0, - "run_time": "0.006 seconds" - } - }, - "TestDebugSimpleTypoWithGuidance": { - 
"data_path": "agbenchmark/challenges/code/d1", - "is_regression": false, - "metrics": { - "difficulty": "basic", - "success": false, - "fail_reason": "assert 1 in [0.0]", - "success_%": 0, - "run_time": "0.489 seconds" - } - }, - "TestBasicMemory": { - "data_path": "agbenchmark/challenges/memory/m1", - "is_regression": false, - "metrics": { - "difficulty": "basic", - "success": true, - "success_%": 0, - "run_time": "0.02 seconds" - } - }, - "TestBasicRetrieval": { - "data_path": "agbenchmark/challenges/retrieval/r1", - "is_regression": false, - "metrics": { - "difficulty": "basic", - "success": true, - "success_%": 0, - "run_time": "0.01 seconds" - } - }, - "TestDebugSimpleTypoWithoutGuidance": { - "data_path": "agbenchmark/challenges/code/d2", - "is_regression": false, - "metrics": { - "difficulty": "novice", - "success": false, - "fail_reason": "agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", - "success_%": 0, - "run_time": "0.001 seconds" - } - }, - "TestCreateSimpleWebServer": { - "data_path": "agbenchmark/challenges/code/d3", - "is_regression": false, - "metrics": { - "difficulty": "advanced", - "success": false, - "fail_reason": "agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", - "success_%": 0, - "run_time": "0.001 seconds" - } - }, - "TestRememberMultipleIds": { - "data_path": "agbenchmark/challenges/memory/m2", - "is_regression": false, - "metrics": { - "difficulty": "novice", - "success": true, - "success_%": 0, - "run_time": "0.018 seconds" - } - }, - "TestRetrieval2": { - "data_path": "agbenchmark/challenges/retrieval/r2", - "is_regression": false, - "metrics": { - "difficulty": "novice", - "success": true, - "success_%": 0, - "run_time": "0.009 seconds" - } - }, - "TestRememberMultipleIdsWithNoise": { - "data_path": "agbenchmark/challenges/memory/m3", - "is_regression": false, - "metrics": { - "difficulty": "intermediate", - "success": true, - "success_%": 0, - "run_time": "0.022 seconds" - } - }, - "TestRetrieval3": { - "data_path": "agbenchmark/challenges/retrieval/r3", - "is_regression": false, - "metrics": { - "difficulty": "intermediate", - "success": true, - "success_%": 0, - "run_time": "0.01 seconds" - } - }, - "TestRememberMultiplePhrasesWithNoise": { - "data_path": "agbenchmark/challenges/memory/m4", - "is_regression": false, - "metrics": { - "difficulty": "advanced", - "success": true, - "success_%": 0, - "run_time": "0.021 seconds" - } - } - }, - "config": { - "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "agbenchmark.benchmarks", - "cutoff": 60 - } -} \ No newline at end of file diff --git a/agbenchmark/reports/file1_07-14-18-54.json b/agbenchmark/reports/file1_07-14-18-54.json new file mode 100644 index 000000000..f81d19d3d --- /dev/null +++ b/agbenchmark/reports/file1_07-14-18-54.json @@ -0,0 +1,147 @@ +{ + "command": "agbenchmark start --mock", + "completion_time": "2023-07-14-18:54", + "metrics": { + "run_time": "0.97 seconds", + "highest_difficulty": "advanced: 5" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": true, + "non_mock_success_%": 75.0, + "run_time": "0.007 seconds" + } + }, + 
"TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": true, + "metrics": { + "difficulty": "interface", + "success": true, + "non_mock_success_%": 100.0, + "run_time": "0.008 seconds" + } + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": true, + "metrics": { + "difficulty": "interface", + "success": true, + "non_mock_success_%": 100.0, + "run_time": "0.007 seconds" + } + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "non_mock_success_%": 0.0, + "run_time": "0.448 seconds" + } + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1", + "is_regression": true, + "metrics": { + "difficulty": "basic", + "success": true, + "non_mock_success_%": 100.0, + "run_time": "0.028 seconds" + } + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1", + "is_regression": true, + "metrics": { + "difficulty": "basic", + "success": true, + "non_mock_success_%": 100.0, + "run_time": "0.014 seconds" + } + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "non_mock_success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d3", + "is_regression": false, + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "non_mock_success_%": 0.0, + "run_time": "0.002 seconds" + } + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2", + "is_regression": true, + "metrics": { + "difficulty": "novice", + "success": true, + "non_mock_success_%": 100.0, + "run_time": "0.023 seconds" + } + }, + "TestRetrieval2": { + "data_path": "agbenchmark/challenges/retrieval/r2", + "is_regression": true, + "metrics": { + "difficulty": "novice", + "success": true, + "non_mock_success_%": 100.0, + "run_time": "0.013 seconds" + } + }, + "TestRememberMultipleIdsWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3", + "is_regression": true, + "metrics": { + "difficulty": "intermediate", + "success": true, + "non_mock_success_%": 100.0, + "run_time": "0.03 seconds" + } + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": true, + "metrics": { + "difficulty": "intermediate", + "success": true, + "non_mock_success_%": 100.0, + "run_time": "0.016 seconds" + } + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4", + "is_regression": true, + "metrics": { + "difficulty": "advanced", + "success": true, + "non_mock_success_%": 100.0, + "run_time": "0.034 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}", + "entry_path": "agbenchmark.benchmarks" + } +} \ No newline at end of file diff --git a/agbenchmark/start_benchmark.py 
b/agbenchmark/start_benchmark.py index ab2586e60..b31c9f5f9 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -62,7 +62,7 @@ def start(category: str, test: str, maintain: bool, improve: bool, mock: bool) - config["entry_path"] = click.prompt( "Please enter the path to your run_specific_agent function implementation within the benchmarks folder", - default="benchmarks.py", + default="agbenchmark/benchmarks.py", ) config["cutoff"] = click.prompt( diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py index 598113d3d..1174e89bb 100644 --- a/agbenchmark/utils.py +++ b/agbenchmark/utils.py @@ -1,6 +1,7 @@ # radio charts, logs, helper functions for tests, anything else relevant. import glob import re +from datetime import datetime from pathlib import Path from typing import Any @@ -12,11 +13,13 @@ def calculate_info_test_path(benchmarks_folder_path: Path) -> str: if not INFO_TESTS_PATH.exists(): INFO_TESTS_PATH.mkdir(parents=True, exist_ok=True) - return str(INFO_TESTS_PATH / "1.json") + return str( + INFO_TESTS_PATH / f"file1_{datetime.now().strftime('%m-%d-%H-%M')}.json" + ) else: json_files = glob.glob(str(INFO_TESTS_PATH / "*.json")) file_count = len(json_files) - run_name = f"{file_count + 1}.json" + run_name = f"file{file_count + 1}_{datetime.now().strftime('%m-%d-%H-%M')}.json" new_file_path = INFO_TESTS_PATH / run_name return str(new_file_path) @@ -35,8 +38,10 @@ def replace_backslash(value: Any) -> Any: def calculate_success_percentage(results: list[bool]) -> float: - success_count = results.count(True) - total_count = len(results) + # Take the last 10 results or all if less than 10 + last_results = results[-10:] if len(results) > 10 else results + success_count = last_results.count(True) + total_count = len(last_results) if total_count == 0: return 0 success_percentage = (success_count / total_count) * 100 # as a percentage @@ -45,7 +50,7 @@ def calculate_success_percentage(results: list[bool]) -> float: def get_highest_success_difficulty(data: dict) -> str: highest_difficulty = None - highest_difficulty_level = -1 + highest_difficulty_level = 0 for test_name, test_data in data.items(): if test_data["metrics"]["success"]: diff --git a/agent/Auto-GPT b/agent/Auto-GPT index 357a918ec..62ad7aa8c 160000 --- a/agent/Auto-GPT +++ b/agent/Auto-GPT @@ -1 +1 @@ -Subproject commit 357a918ecc9936207c70cf363bb95d74ec510e84 +Subproject commit 62ad7aa8c9172f8b07cad939e215912088d6dc16 diff --git a/agent/SuperAGI b/agent/SuperAGI index bd4b3def6..f880b2464 160000 --- a/agent/SuperAGI +++ b/agent/SuperAGI @@ -1 +1 @@ -Subproject commit bd4b3def65e964182b05bb9f7a350b00f55a6007 +Subproject commit f880b24644fbd057d44e8b4390f3ac165c90249b diff --git a/agent/config_example.json b/agent/config_example.json index 7ab65bc20..9e8bd3f08 100644 --- a/agent/config_example.json +++ b/agent/config_example.json @@ -1,5 +1,4 @@ { "workspace": "projects/my-new-project/workspace", - "entry_path": "agbenchmark/benchmarks.py", - "cutoff": 60 + "entry_path": "agbenchmark/benchmarks.py" } diff --git a/agent/gpt-engineer b/agent/gpt-engineer index f0c76918d..a0162df0d 160000 --- a/agent/gpt-engineer +++ b/agent/gpt-engineer @@ -1 +1 @@ -Subproject commit f0c76918dff7a6cf5e0611a09b060fc5d4913b82 +Subproject commit a0162df0db24be0c888ad56d12bd59d6130d32f0 diff --git a/agent/mini-agi b/agent/mini-agi index 08764876d..0f8eba95d 160000 --- a/agent/mini-agi +++ b/agent/mini-agi @@ -1 +1 @@ -Subproject commit 08764876d9a5c84c9f9e879088854d2b9349d7a0 +Subproject commit 
0f8eba95d284a9a06801b40ae02c55f65f1a0ce9 diff --git a/agent/smol-developer b/agent/smol-developer index f4f439551..70b57dd04 160000 --- a/agent/smol-developer +++ b/agent/smol-developer @@ -1 +1 @@ -Subproject commit f4f4395511ed6ba59ec09100d6596bf81d68a898 +Subproject commit 70b57dd042bea14d6e21d56e9e115ee0fc9676f7 -- cgit v1.2.3 From 2704bcee5ef86eb3da75139a08f618135f66d754 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Sun, 16 Jul 2023 07:26:36 -0700 Subject: Allow change location of reports (#115) Signed-off-by: Merwane Hamadi --- .github/workflows/ci.yml | 7 +++---- agbenchmark/utils.py | 6 +++++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 907c21267..9d4769e76 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -64,7 +64,6 @@ jobs: if: success() || failure() tests: - if: github.event_name != 'pull_request' || matrix.cache-enabled == false name: "${{ matrix.agent-name }} (Cache: ${{ matrix.cache-enabled }})" runs-on: ubuntu-latest timeout-minutes: 10 @@ -161,12 +160,12 @@ jobs: REQUESTS_CA_BUNDLE: /etc/ssl/certs/ca-certificates.crt HELICONE_CACHE_ENABLED: ${{ matrix.cache-enabled }} HELICONE_PROPERTY_AGENT: ${{ matrix.agent-name }} + REPORT_LOCATION: ${{ matrix.cache-enabled == true && format('../../../benchmark_runs/{0}', matrix.agent-name) || '.' }} - name: Upload reports if: always() uses: actions/upload-artifact@v3 with: - name: - "${{ matrix.agent-name }} (Cache ${{ matrix.cache-enabled }})" - path: agent/${{ matrix.agent-name }}/agbenchmark + name: ${{ matrix.agent-name }} + path: benchmark_runs/${{ matrix.agent-name }} diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py index 1174e89bb..506c48847 100644 --- a/agbenchmark/utils.py +++ b/agbenchmark/utils.py @@ -1,5 +1,6 @@ # radio charts, logs, helper functions for tests, anything else relevant. 
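The `REPORT_LOCATION` variable introduced in the workflow above is consumed by the `calculate_info_test_path` change in this utils.py hunk. A minimal standalone sketch of the pattern, with an illustrative helper name that is not part of the project's API:

```python
import os
from pathlib import Path


def resolve_reports_dir(benchmarks_folder: Path) -> Path:
    # REPORT_LOCATION defaults to "." so a plain local run keeps writing to
    # <benchmarks_folder>/reports; CI points it at a per-agent directory instead.
    reports_dir = benchmarks_folder / os.getenv("REPORT_LOCATION", ".") / "reports"
    reports_dir.mkdir(parents=True, exist_ok=True)
    return reports_dir
```

Defaulting to `"."` leaves local behaviour untouched while the workflow redirects reports to `benchmark_runs/<agent-name>`, the same directory the upload step collects.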
import glob +import os import re from datetime import datetime from pathlib import Path @@ -9,7 +10,10 @@ from agbenchmark.challenges.define_task_types import DIFFICULTY_MAP, DifficultyL def calculate_info_test_path(benchmarks_folder_path: Path) -> str: - INFO_TESTS_PATH = benchmarks_folder_path / "reports" + + INFO_TESTS_PATH = ( + benchmarks_folder_path / os.getenv("REPORT_LOCATION", ".") / "reports" + ) if not INFO_TESTS_PATH.exists(): INFO_TESTS_PATH.mkdir(parents=True, exist_ok=True) -- cgit v1.2.3 From 2cfafcfbf02c85182f68e6faa5d50c342f340faa Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Sun, 16 Jul 2023 07:54:49 -0700 Subject: Fix cutoff errors (#116) Signed-off-by: Merwane Hamadi --- agent/gpt-engineer | 2 +- agent/smol-developer | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/agent/gpt-engineer b/agent/gpt-engineer index a0162df0d..9bb81041a 160000 --- a/agent/gpt-engineer +++ b/agent/gpt-engineer @@ -1 +1 @@ -Subproject commit a0162df0db24be0c888ad56d12bd59d6130d32f0 +Subproject commit 9bb81041ace9f09e8ea0e34e29f2e46bb9d46a36 diff --git a/agent/smol-developer b/agent/smol-developer index 70b57dd04..a23d01369 160000 --- a/agent/smol-developer +++ b/agent/smol-developer @@ -1 +1 @@ -Subproject commit 70b57dd042bea14d6e21d56e9e115ee0fc9676f7 +Subproject commit a23d01369cea976e80b7889fdbf1096619471301 -- cgit v1.2.3 From 117e8c8dd1879dd97fe9933fc2bf9a6b2cd65a92 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Sun, 16 Jul 2023 08:10:53 -0700 Subject: Fix pipes issue (#117) --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9d4769e76..aca2e3f5a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -149,7 +149,7 @@ jobs: agbenchmark start --mock --category=iterate else curl -s https://raw.githubusercontent.com/Helicone/helicone/main/mitmproxy.sh | bash -s start - agbenchmark start | echo "This command will always return a non zero exit code unless all the challenges are solved." + agbenchmark start || echo "This command will always return a non zero exit code unless all the challenges are solved." 
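The one-character change above swaps a pipe for a logical OR. With `agbenchmark start | echo …`, `echo` ignores its stdin, the notice prints unconditionally, and (absent `pipefail`) the step's exit status comes from `echo` alone; with `||`, the benchmark's own output reaches the log and the notice appears only when `agbenchmark start` actually fails, while the step still exits cleanly. A rough Python equivalent of the intended behaviour, assuming an `agbenchmark` executable on the PATH:

```python
import subprocess

# Run the benchmark, let its output stream to the log, and tolerate a
# non-zero exit code: failing challenges should be reported, not fail CI.
result = subprocess.run(["agbenchmark", "start"])
if result.returncode != 0:
    print(
        "This command will always return a non zero exit code "
        "unless all the challenges are solved."
    )
```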
fi env: GITHUB_EVENT_NAME: ${{ github.event_name }} -- cgit v1.2.3 From b904041ea17829f4fd522a794d5a7b06b95c923b Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Sun, 16 Jul 2023 15:49:36 -0700 Subject: Update reports when pushing to master (#162) Signed-off-by: Merwane Hamadi --- .github/workflows/ci.yml | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index aca2e3f5a..a2224ea78 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,6 +7,8 @@ on: - cron: "0 8 * * *" push: branches: [master, ci-test*] + paths-ignore: + - 'benchmark_runs/**' pull_request: branches: [stable, master, release-*] @@ -64,11 +66,12 @@ jobs: if: success() || failure() tests: + env: + GH_TOKEN: ${{ github.event_name == 'pull_request' && github.token || secrets.PAT }} + min-python-version: "3.10" name: "${{ matrix.agent-name }} (Cache: ${{ matrix.cache-enabled }})" runs-on: ubuntu-latest timeout-minutes: 10 - env: - min-python-version: "3.10" strategy: fail-fast: false matrix: @@ -87,6 +90,7 @@ jobs: ref: ${{ github.event.pull_request.head.ref }} repository: ${{ github.event.pull_request.head.repo.full_name }} submodules: true + token: ${{ env.GH_TOKEN }} - name: Set up Python ${{ env.min-python-version }} uses: actions/setup-python@v2 @@ -151,6 +155,9 @@ jobs: curl -s https://raw.githubusercontent.com/Helicone/helicone/main/mitmproxy.sh | bash -s start agbenchmark start || echo "This command will always return a non zero exit code unless all the challenges are solved." fi + + cd ../.. + env: GITHUB_EVENT_NAME: ${{ github.event_name }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} @@ -160,7 +167,7 @@ jobs: REQUESTS_CA_BUNDLE: /etc/ssl/certs/ca-certificates.crt HELICONE_CACHE_ENABLED: ${{ matrix.cache-enabled }} HELICONE_PROPERTY_AGENT: ${{ matrix.agent-name }} - REPORT_LOCATION: ${{ matrix.cache-enabled == true && format('../../../benchmark_runs/{0}', matrix.agent-name) || '.' }} + REPORT_LOCATION: ${{ matrix.cache-enabled == false && format('../../../benchmark_runs/{0}', matrix.agent-name) || '.' 
}} - name: Upload reports @@ -169,3 +176,18 @@ jobs: with: name: ${{ matrix.agent-name }} path: benchmark_runs/${{ matrix.agent-name }} + + - name: Authenticate and Push to Branch + if: (success() || failure()) && (github.event_name != 'pull_request' && matrix.cache-enabled == false) + run: | + git config --global user.email "github-bot@agpt.co" + git config --global user.name "Auto-GPT-Bot" + + git add benchmark_runs/* || echo "nothing to commit" + commit_message="${{ matrix.agent-name }}-$(date +'%Y%m%d%H%M%S')" + git commit -m "${commit_message}" + + current_branch=${{ github.ref_name }} + git fetch origin $current_branch + git rebase origin/$current_branch + git push origin HEAD -- cgit v1.2.3 From a36eadf554df292ceaecf549fc02f2e949521c66 Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Sun, 16 Jul 2023 22:52:31 +0000 Subject: Auto-GPT-20230716225231 --- .../Auto-GPT/reports/file1_07-16-22-52.json | 179 +++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 benchmark_runs/Auto-GPT/reports/file1_07-16-22-52.json diff --git a/benchmark_runs/Auto-GPT/reports/file1_07-16-22-52.json b/benchmark_runs/Auto-GPT/reports/file1_07-16-22-52.json new file mode 100644 index 000000000..c13e4f59a --- /dev/null +++ b/benchmark_runs/Auto-GPT/reports/file1_07-16-22-52.json @@ -0,0 +1,179 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-16-22:52", + "metrics": { + "run_time": "14.51 seconds", + "highest_difficulty": ": 0" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 40.0, + "run_time": "14.286 seconds" + } + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d4", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 75.0, + "run_time": "0.001 seconds" + } + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 75.0, + "run_time": "0.002 seconds" + } + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5", + "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestThreeSum::test_method[challenge_data0] depends on 
venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0], venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0], venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 75.0, + "run_time": "0.001 seconds" + } + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0], venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0], venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 75.0, + "run_time": "0.001 seconds" + } + }, + "TestRetrieval2": { + "data_path": "agbenchmark/challenges/retrieval/r2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 75.0, + "run_time": "0.001 seconds" + } + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d3", + "is_regression": false, + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on 
venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 75.0, + "run_time": "0.001 seconds" + } + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2::test_method[challenge_data0]", + "success_%": 75.0, + "run_time": "0.001 seconds" + } + }, + "TestRememberMultipleIdsWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3", + "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIdsWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 75.0, + "run_time": "0.001 seconds" + } + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4", + "is_regression": false, + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIdsWithNoise::test_method[challenge_data0]", + "success_%": 75.0, + "run_time": "0.001 seconds" + } + } + }, + "config": { + "workspace": "autogpt/workspace/auto_gpt_workspace", + "entry_path": "agbenchmark.benchmarks" + } +} \ No newline at end of file -- cgit v1.2.3 From 5c7acbc71986d164e377740cbef7f8bf26e160e3 Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Sun, 16 Jul 2023 22:59:08 +0000 Subject: gpt-engineer-20230716225908 --- .../gpt-engineer/reports/file1_07-16-22-51.json | 175 +++++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100644 benchmark_runs/gpt-engineer/reports/file1_07-16-22-51.json diff --git a/benchmark_runs/gpt-engineer/reports/file1_07-16-22-51.json b/benchmark_runs/gpt-engineer/reports/file1_07-16-22-51.json new file mode 100644 index 000000000..acfd01457 --- /dev/null +++ b/benchmark_runs/gpt-engineer/reports/file1_07-16-22-51.json @@ -0,0 +1,175 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-16-22:59", + "metrics": { + "run_time": "449.82 seconds", + "highest_difficulty": "novice: 3" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 50.0, + "run_time": "62.5 seconds" + } + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d4", + 
"is_regression": false, + "metrics": { + "difficulty": "novice", + "success": true, + "success_%": 100.0, + "run_time": "70.822 seconds" + } + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": true, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "68.908 seconds" + } + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: '/home/runner/work/Auto-GPT-Benchmarks/Auto-GPT-Benchmarks/agent/gpt-engineer/projects/my-new-project/workspace/file_to_check.txt'", + "success_%": 75.0, + "run_time": "60.495 seconds" + } + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5", + "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "69.361 seconds" + } + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1", + "is_regression": true, + "metrics": { + "difficulty": "basic", + "success": true, + "success_%": 100.0, + "run_time": "67.503 seconds" + } + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 75.0, + "run_time": "0.001 seconds" + } + }, + "TestRetrieval2": { + "data_path": "agbenchmark/challenges/retrieval/r2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "assert 1 in [0.0, 0.0]", + "success_%": 75.0, + "run_time": "50.064 seconds" + } + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d3", + "is_regression": false, + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on 
agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 75.0, + "run_time": "0.001 seconds" + } + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2::test_method[challenge_data0]", + "success_%": 75.0, + "run_time": "0.001 seconds" + } + }, + "TestRememberMultipleIdsWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3", + "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIdsWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 75.0, + "run_time": "0.001 seconds" + } + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4", + "is_regression": false, + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIdsWithNoise::test_method[challenge_data0]", + "success_%": 75.0, + "run_time": "0.001 seconds" + } + } + }, + "config": { + "workspace": "projects/my-new-project/workspace", + "entry_path": "agbenchmark.benchmarks" + } +} \ No newline at end of file -- cgit v1.2.3 From ce4cefe7e7211025994a4eab84c3a96209e705cb Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 16 Jul 2023 21:24:06 -0400 Subject: Dynamic home path for runs (#119) --- .env.example | 2 +- agbenchmark/README.md | 72 ++-------- agbenchmark/agent_interface.py | 9 +- agbenchmark/config.json | 3 +- agbenchmark/internal_info.json | 12 +- agbenchmark/reports/file1_07-14-18-54.json | 147 --------------------- .../reports/mini-agi/file1_07-16-13-07.json | 23 ++++ agbenchmark/start_benchmark.py | 20 +-- agbenchmark/utils.py | 81 ++++++++++-- agent/Auto-GPT | 2 +- agent/SuperAGI | 2 +- agent/gpt-engineer | 2 +- agent/mini-agi | 2 +- agent/smol-developer | 2 +- 14 files changed, 135 insertions(+), 244 deletions(-) delete mode 100644 agbenchmark/reports/file1_07-14-18-54.json create mode 100644 agbenchmark/reports/mini-agi/file1_07-16-13-07.json diff --git a/.env.example b/.env.example index e50ed58a5..197810bbb 100644 --- a/.env.example +++ 
b/.env.example @@ -1,3 +1,3 @@ AGENT_NAME=mini-agi -ENVIRONMENT=local +HOME_ENV= MOCK_TEST=False \ No newline at end of file diff --git a/agbenchmark/README.md b/agbenchmark/README.md index 42e2bd4dd..c814e6cff 100644 --- a/agbenchmark/README.md +++ b/agbenchmark/README.md @@ -40,45 +40,6 @@ Let people know what beautiful code you write does, document everything well Share your progress :) -### Pytest - -an example of a test is below, use it as a template and change the class name, the .json name, what the test depends on and it's name, and the scoring logic - -```python -import pytest -from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge -import os - - -class TestWriteFile(BasicChallenge): - """Testing if LLM can write to a file""" - - def test_method(self, config): - # implement scoring logic by looking at workspace -``` - -All challenges will inherit from parent class which has the mark and any specific methods for their category - -```python -@pytest.mark.basic -class BasicChallenge(Challenge): - pass -``` - -Add the below to create a file in the workspace prior to running a challenge. Only use when a file is needed to be created in the workspace prior to a test, such as with the read_file_test. - -```python -@pytest.fixture( - scope="module", autouse=True - ) # this is specific to setting up a file for the test, not all tests have this - def setup_module(self, workspace): - Challenge.write_to_file( - workspace, self.data.ground.files[0], "this is how we're doing" - ) -``` - -#### The main Challenge class has all the parametrization and loading logic so that all tests can inherit from it. It lives within [this file](https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/blob/master/agbenchmark/Challenge.py) - ## Workspace If `--mock` flag is used it is at `agbenchmark/workspace`. Otherwise for mini-agi it is at `C:/Users//miniagi` - it will be automatically set on config @@ -87,29 +48,7 @@ If `--mock` flag is used it is at `agbenchmark/workspace`. Otherwise for mini-ag Manually created, existing challenges within Auto-Gpt, https://osu-nlp-group.github.io/Mind2Web/ -## Repo - -``` -|-- auto-gpt-benchmarks/ **main project directory** -| |-- metrics.py **combining scores, metrics, final evaluation** -| |-- start_benchmark.py **entry point from cli** -| |-- conftest.py **config, workspace creation + teardown, regression tesst markers, parameterization** -| |-- Challenge.py **easy challenge creation class** -| |-- config.json **workspace folder** -| |-- challenges/ **challenges across different domains** -| | |-- adaptability/ -| | |-- basic_abilities/ -| | |-- code/ -| | |-- memory/ -| | |-- retrieval/ -| | |-- web_navigation/ -| | |-- writing/ -| |-- tests/ -| | |-- basic_abilities/ **every llm should pass these challenges** -| | |-- regression/ **challenges that already passed** -``` - -## How to add new agents to agbenchmark ? +## How do I add new agents to agbenchmark? Example with smol developer. @@ -120,3 +59,12 @@ https://github.com/smol-ai/developer/pull/114/files 2- Create the submodule and the GitHub workflow by following the same pattern as this example: https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/pull/48/files + +## How do I run an agent in different environments? + +**To just use agbenchmark as the benchmark for your agent**, `pip install` the package and run `agbenchmark start` + +**For internal Auto-GPT CI runs**, specify the `AGENT_NAME` you want to use and set the `HOME_ENV`. +Ex. 
`HOME_ENV=ci AGENT_NAME=mini-agi` + +**To develop an agent alongside the benchmark**, you can specify the `AGENT_NAME` you want to use and add it as a submodule to the repo diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index 897f4f8cf..ff5bc8909 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -7,7 +7,7 @@ from typing import Any, Dict from dotenv import load_dotenv -from agbenchmark.start_benchmark import CURRENT_DIRECTORY +from agbenchmark.start_benchmark import CURRENT_DIRECTORY, HOME_DIRECTORY load_dotenv() @@ -25,13 +25,16 @@ def run_agent( config["workspace"], "artifacts_out", challenge_location ) else: - print(f"Running Python function '{config['entry_path']}' with timeout {cutoff}") - command = [sys.executable, "-m", config["entry_path"], str(task)] + entry_path = "agbenchmark.benchmarks" + + print(f"Running Python function '{entry_path}' with timeout {cutoff}") + command = [sys.executable, "-m", entry_path, str(task)] process = subprocess.Popen( command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True, + cwd=HOME_DIRECTORY, ) start_time = time.time() diff --git a/agbenchmark/config.json b/agbenchmark/config.json index 820f133b1..3a03b7412 100644 --- a/agbenchmark/config.json +++ b/agbenchmark/config.json @@ -1,4 +1,3 @@ { - "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "agbenchmark.benchmarks" + "workspace": "${os.path.join(Path.home(), 'miniagi')}" } diff --git a/agbenchmark/internal_info.json b/agbenchmark/internal_info.json index 95a051d54..0e34ad7a3 100644 --- a/agbenchmark/internal_info.json +++ b/agbenchmark/internal_info.json @@ -15,6 +15,8 @@ false ], "TestDebugSimpleTypoWithGuidance": [ + false, + false, false, false, false @@ -25,6 +27,7 @@ false ], "TestReadFile": [ + true, true, true, true @@ -55,6 +58,7 @@ true ], "TestSearch": [ + true, true, true, true @@ -68,6 +72,12 @@ false, false, true, - false + false, + true, + false, + false, + false, + false, + true ] } \ No newline at end of file diff --git a/agbenchmark/reports/file1_07-14-18-54.json b/agbenchmark/reports/file1_07-14-18-54.json deleted file mode 100644 index f81d19d3d..000000000 --- a/agbenchmark/reports/file1_07-14-18-54.json +++ /dev/null @@ -1,147 +0,0 @@ -{ - "command": "agbenchmark start --mock", - "completion_time": "2023-07-14-18:54", - "metrics": { - "run_time": "0.97 seconds", - "highest_difficulty": "advanced: 5" - }, - "tests": { - "TestWriteFile": { - "data_path": "agbenchmark/challenges/interface/write_file", - "is_regression": false, - "metrics": { - "difficulty": "interface", - "success": true, - "non_mock_success_%": 75.0, - "run_time": "0.007 seconds" - } - }, - "TestReadFile": { - "data_path": "agbenchmark/challenges/interface/read_file", - "is_regression": true, - "metrics": { - "difficulty": "interface", - "success": true, - "non_mock_success_%": 100.0, - "run_time": "0.008 seconds" - } - }, - "TestSearch": { - "data_path": "agbenchmark/challenges/interface/search", - "is_regression": true, - "metrics": { - "difficulty": "interface", - "success": true, - "non_mock_success_%": 100.0, - "run_time": "0.007 seconds" - } - }, - "TestDebugSimpleTypoWithGuidance": { - "data_path": "agbenchmark/challenges/code/d1", - "is_regression": false, - "metrics": { - "difficulty": "basic", - "success": false, - "fail_reason": "assert 1 in [0.0]", - "non_mock_success_%": 0.0, - "run_time": "0.448 seconds" - } - }, - "TestBasicMemory": { - "data_path": "agbenchmark/challenges/memory/m1", - 
"is_regression": true, - "metrics": { - "difficulty": "basic", - "success": true, - "non_mock_success_%": 100.0, - "run_time": "0.028 seconds" - } - }, - "TestBasicRetrieval": { - "data_path": "agbenchmark/challenges/retrieval/r1", - "is_regression": true, - "metrics": { - "difficulty": "basic", - "success": true, - "non_mock_success_%": 100.0, - "run_time": "0.014 seconds" - } - }, - "TestDebugSimpleTypoWithoutGuidance": { - "data_path": "agbenchmark/challenges/code/d2", - "is_regression": false, - "metrics": { - "difficulty": "novice", - "success": false, - "fail_reason": "agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", - "non_mock_success_%": 0.0, - "run_time": "0.001 seconds" - } - }, - "TestCreateSimpleWebServer": { - "data_path": "agbenchmark/challenges/code/d3", - "is_regression": false, - "metrics": { - "difficulty": "advanced", - "success": false, - "fail_reason": "agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", - "non_mock_success_%": 0.0, - "run_time": "0.002 seconds" - } - }, - "TestRememberMultipleIds": { - "data_path": "agbenchmark/challenges/memory/m2", - "is_regression": true, - "metrics": { - "difficulty": "novice", - "success": true, - "non_mock_success_%": 100.0, - "run_time": "0.023 seconds" - } - }, - "TestRetrieval2": { - "data_path": "agbenchmark/challenges/retrieval/r2", - "is_regression": true, - "metrics": { - "difficulty": "novice", - "success": true, - "non_mock_success_%": 100.0, - "run_time": "0.013 seconds" - } - }, - "TestRememberMultipleIdsWithNoise": { - "data_path": "agbenchmark/challenges/memory/m3", - "is_regression": true, - "metrics": { - "difficulty": "intermediate", - "success": true, - "non_mock_success_%": 100.0, - "run_time": "0.03 seconds" - } - }, - "TestRetrieval3": { - "data_path": "agbenchmark/challenges/retrieval/r3", - "is_regression": true, - "metrics": { - "difficulty": "intermediate", - "success": true, - "non_mock_success_%": 100.0, - "run_time": "0.016 seconds" - } - }, - "TestRememberMultiplePhrasesWithNoise": { - "data_path": "agbenchmark/challenges/memory/m4", - "is_regression": true, - "metrics": { - "difficulty": "advanced", - "success": true, - "non_mock_success_%": 100.0, - "run_time": "0.034 seconds" - } - } - }, - "config": { - "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "agbenchmark.benchmarks" - } -} \ No newline at end of file diff --git a/agbenchmark/reports/mini-agi/file1_07-16-13-07.json b/agbenchmark/reports/mini-agi/file1_07-16-13-07.json new file mode 100644 index 000000000..78bafc5f1 --- /dev/null +++ b/agbenchmark/reports/mini-agi/file1_07-16-13-07.json @@ -0,0 +1,23 @@ +{ + "command": "agbenchmark start --test TestWriteFile", + "completion_time": "2023-07-16-13:07", + "metrics": { + "run_time": "13.91 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 30.0, + "run_time": "13.684 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + } +} \ No newline at end of file diff --git a/agbenchmark/start_benchmark.py 
b/agbenchmark/start_benchmark.py index b31c9f5f9..ea17d1523 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -6,20 +6,17 @@ from typing import Any import click import pytest -from dotenv import load_dotenv -load_dotenv() - -from agbenchmark.utils import calculate_info_test_path +from agbenchmark.utils import calculate_dynamic_paths CURRENT_DIRECTORY = Path(__file__).resolve().parent -benchmarks_folder_path = Path(os.getcwd()) / "agbenchmark" - -CONFIG_PATH = str(benchmarks_folder_path / "config.json") -REGRESSION_TESTS_PATH = str(benchmarks_folder_path / "regression_tests.json") - -INFO_TESTS_PATH = calculate_info_test_path(benchmarks_folder_path) +( + HOME_DIRECTORY, + CONFIG_PATH, + REGRESSION_TESTS_PATH, + INFO_TESTS_PATH, +) = calculate_dynamic_paths() @click.group() @@ -48,9 +45,6 @@ def start(category: str, test: str, maintain: bool, improve: bool, mock: bool) - ) return 1 - if not benchmarks_folder_path.exists(): - benchmarks_folder_path.mkdir(exist_ok=True) - print(CONFIG_PATH, os.path.exists(CONFIG_PATH), os.stat(CONFIG_PATH).st_size) if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0: config = {} diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py index 506c48847..c69509c70 100644 --- a/agbenchmark/utils.py +++ b/agbenchmark/utils.py @@ -6,25 +6,28 @@ from datetime import datetime from pathlib import Path from typing import Any -from agbenchmark.challenges.define_task_types import DIFFICULTY_MAP, DifficultyLevel +from dotenv import load_dotenv + +load_dotenv() +from agbenchmark.challenges.define_task_types import DIFFICULTY_MAP, DifficultyLevel -def calculate_info_test_path(benchmarks_folder_path: Path) -> str: +AGENT_NAME = os.getenv("AGENT_NAME") +HOME_ENV = os.getenv("HOME_ENV") - INFO_TESTS_PATH = ( - benchmarks_folder_path / os.getenv("REPORT_LOCATION", ".") / "reports" - ) - if not INFO_TESTS_PATH.exists(): - INFO_TESTS_PATH.mkdir(parents=True, exist_ok=True) +def calculate_info_test_path(reports_path: Path) -> str: + print("reports_pathreports_pathreports_pathreports_path", reports_path) + if not reports_path.exists(): + reports_path.mkdir(parents=True, exist_ok=True) return str( - INFO_TESTS_PATH / f"file1_{datetime.now().strftime('%m-%d-%H-%M')}.json" + reports_path / f"file1_{datetime.now().strftime('%m-%d-%H-%M')}.json" ) else: - json_files = glob.glob(str(INFO_TESTS_PATH / "*.json")) + json_files = glob.glob(str(reports_path / "*.json")) file_count = len(json_files) run_name = f"file{file_count + 1}_{datetime.now().strftime('%m-%d-%H-%M')}.json" - new_file_path = INFO_TESTS_PATH / run_name + new_file_path = reports_path / run_name return str(new_file_path) @@ -79,3 +82,61 @@ def get_highest_success_difficulty(data: dict) -> str: highest_difficulty_str = "" return f"{highest_difficulty_str}: {highest_difficulty_level}" + + +def assign_paths(folder_path: Path) -> tuple[str, str, str]: + CONFIG_PATH = str(folder_path / "config.json") + REGRESSION_TESTS_PATH = str(folder_path / "regression_tests.json") + + if HOME_ENV == "ci" and AGENT_NAME: + INFO_TESTS_PATH = calculate_info_test_path( + Path(os.getcwd()) / "agbenchmark" / "reports" / AGENT_NAME + ) + else: + INFO_TESTS_PATH = calculate_info_test_path(folder_path / "reports") + + return CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH + + +def calculate_dynamic_paths() -> tuple[Path, str, str, str]: + # the default home is where you're running from + HOME_DIRECTORY = Path(os.getcwd()) + benchmarks_folder_path = HOME_DIRECTORY / "agbenchmark" + + if 
AGENT_NAME and HOME_ENV == "ci": + if "/Auto-GPT-Benchmarks/agent" in str(HOME_DIRECTORY): + raise Exception("Must run from root of benchmark repo if HOME_ENV is ci") + + # however if the env is local and the agent name is defined, we want to run that agent from the repo and then get the data in the internal agbenchmark directory + # this is for the ci/cd pipeline + benchmarks_folder_path = HOME_DIRECTORY / "agent" / AGENT_NAME / "agbenchmark" + + CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH = assign_paths( + benchmarks_folder_path + ) + + # we want to run the agent from the submodule + HOME_DIRECTORY = Path(os.getcwd()) / "agent" / AGENT_NAME + + elif AGENT_NAME and not os.path.join("Auto-GPT-Benchmarks", "agent") in str( + HOME_DIRECTORY + ): + # if the agent name is defined but the run is not from the agent repo, then home is the agent repo + # used for development of both a benchmark and an agent + HOME_DIRECTORY = Path(os.getcwd()) / "agent" / AGENT_NAME + benchmarks_folder_path = HOME_DIRECTORY / "agbenchmark" + + CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH = assign_paths( + benchmarks_folder_path + ) + else: + # otherwise the default is when home is an agent (running agbenchmark from agent/agent_repo) + # used when its just a pip install + CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH = assign_paths( + benchmarks_folder_path + ) + + if not benchmarks_folder_path.exists(): + benchmarks_folder_path.mkdir(exist_ok=True) + + return HOME_DIRECTORY, CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH diff --git a/agent/Auto-GPT b/agent/Auto-GPT index 62ad7aa8c..114c484b5 160000 --- a/agent/Auto-GPT +++ b/agent/Auto-GPT @@ -1 +1 @@ -Subproject commit 62ad7aa8c9172f8b07cad939e215912088d6dc16 +Subproject commit 114c484b5cfe9a69a74ddcc00025d4a126f54120 diff --git a/agent/SuperAGI b/agent/SuperAGI index f880b2464..ae3b89a32 160000 --- a/agent/SuperAGI +++ b/agent/SuperAGI @@ -1 +1 @@ -Subproject commit f880b24644fbd057d44e8b4390f3ac165c90249b +Subproject commit ae3b89a325994c9dda74b5de39d6f7c48010270f diff --git a/agent/gpt-engineer b/agent/gpt-engineer index 9bb81041a..a1d9673f8 160000 --- a/agent/gpt-engineer +++ b/agent/gpt-engineer @@ -1 +1 @@ -Subproject commit 9bb81041ace9f09e8ea0e34e29f2e46bb9d46a36 +Subproject commit a1d9673f82ffce89a9b437e1b54d2e068160819d diff --git a/agent/mini-agi b/agent/mini-agi index 0f8eba95d..bb02bf0d5 160000 --- a/agent/mini-agi +++ b/agent/mini-agi @@ -1 +1 @@ -Subproject commit 0f8eba95d284a9a06801b40ae02c55f65f1a0ce9 +Subproject commit bb02bf0d5cdbf045ff145271b78e4b4ee7225011 diff --git a/agent/smol-developer b/agent/smol-developer index a23d01369..bec01917a 160000 --- a/agent/smol-developer +++ b/agent/smol-developer @@ -1 +1 @@ -Subproject commit a23d01369cea976e80b7889fdbf1096619471301 +Subproject commit bec01917a9fa6e7bd73e4d14b328dba468cae495 -- cgit v1.2.3 From dffc1dfd51ebe313d6b20e90a765d538a04f8e4b Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Mon, 17 Jul 2023 09:39:24 -0400 Subject: internal_info.json dynamic changes (#163) --- agbenchmark/ReportManager.py | 16 +++++-- agbenchmark/conftest.py | 22 +++++++-- agbenchmark/internal_info.json | 83 ---------------------------------- agbenchmark/reports/internal_info.json | 40 ++++++++++++++++ agbenchmark/utils.py | 9 +++- 5 files changed, 77 insertions(+), 93 deletions(-) delete mode 100644 agbenchmark/internal_info.json create mode 100644 agbenchmark/reports/internal_info.json diff --git a/agbenchmark/ReportManager.py b/agbenchmark/ReportManager.py index cae13595a..202574f9f 
100644 --- a/agbenchmark/ReportManager.py +++ b/agbenchmark/ReportManager.py @@ -3,7 +3,7 @@ import os import sys import time from datetime import datetime -from typing import Any, Dict +from typing import Any, Dict, Optional from agbenchmark.utils import get_highest_success_difficulty @@ -37,8 +37,18 @@ class ReportManager: with open(self.filename, "w") as f: json.dump(self.tests, f, indent=4) - def add_test(self, test_name: str, test_details: dict | list) -> None: - self.tests[test_name] = test_details + def add_test( + self, + test_name: str, + test_details: dict | list, + agent_name: Optional[str] = None, + ) -> None: + if agent_name: + if agent_name not in self.tests: + self.tests[agent_name] = {} + self.tests[agent_name][test_name] = test_details + else: + self.tests[test_name] = test_details self.save() diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 245df485e..4a62af0b5 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -15,7 +15,7 @@ from agbenchmark.start_benchmark import ( REGRESSION_TESTS_PATH, get_regression_data, ) -from agbenchmark.utils import calculate_success_percentage +from agbenchmark.utils import AGENT_NAME, calculate_success_percentage def resolve_workspace(workspace: str) -> str: @@ -128,9 +128,10 @@ regression_manager = ReportManager(REGRESSION_TESTS_PATH) # user facing reporting information info_manager = ReportManager(INFO_TESTS_PATH) -INTERNAL_LOGS = Path(__file__).resolve().parent # agbenchmark/conftest.py +INTERNAL_LOGS_PATH = Path(__file__).resolve().parent / "reports" + # internal db step in replacement track pass/fail rate -internal_info = ReportManager(str(INTERNAL_LOGS / "internal_info.json")) +internal_info = ReportManager(str(INTERNAL_LOGS_PATH / "internal_info.json")) def pytest_runtest_makereport(item: Any, call: Any) -> None: @@ -171,11 +172,22 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None: regression_manager.remove_test(test_name) info_details["metrics"]["fail_reason"] = str(call.excinfo.value) - prev_test_results: list[bool] = internal_info.tests.get(test_name, []) + prev_test_results: list[bool] + agent_tests: dict[str, list[bool]] = {} + + # if the structure is nested inside of the agent name + if AGENT_NAME: + agent_tests = internal_info.tests.get(AGENT_NAME, {}) + + if agent_tests: + prev_test_results = agent_tests.get(test_name, []) + else: + prev_test_results = internal_info.tests.get(test_name, []) + if not mock: # only add if it's an actual test prev_test_results.append(info_details["metrics"]["success"]) - internal_info.add_test(test_name, prev_test_results) + internal_info.add_test(test_name, prev_test_results, AGENT_NAME) # can calculate success rate regardless of mock info_details["metrics"]["success_%"] = calculate_success_percentage( diff --git a/agbenchmark/internal_info.json b/agbenchmark/internal_info.json deleted file mode 100644 index 0e34ad7a3..000000000 --- a/agbenchmark/internal_info.json +++ /dev/null @@ -1,83 +0,0 @@ -{ - "TestBasicMemory": [ - true, - true, - true - ], - "TestBasicRetrieval": [ - true, - true, - true - ], - "TestCreateSimpleWebServer": [ - false, - false, - false - ], - "TestDebugSimpleTypoWithGuidance": [ - false, - false, - false, - false, - false - ], - "TestDebugSimpleTypoWithoutGuidance": [ - false, - false, - false - ], - "TestReadFile": [ - true, - true, - true, - true - ], - "TestRememberMultipleIds": [ - true, - true, - true - ], - "TestRememberMultipleIdsWithNoise": [ - true, - true, - true - ], - "TestRememberMultiplePhrasesWithNoise": [ - 
true, - true, - true - ], - "TestRetrieval2": [ - true, - true, - true - ], - "TestRetrieval3": [ - true, - true, - true - ], - "TestSearch": [ - true, - true, - true, - true - ], - "TestWriteFile": [ - true, - true, - true, - false, - false, - false, - false, - true, - false, - true, - false, - false, - false, - false, - true - ] -} \ No newline at end of file diff --git a/agbenchmark/reports/internal_info.json b/agbenchmark/reports/internal_info.json new file mode 100644 index 000000000..97b525c0f --- /dev/null +++ b/agbenchmark/reports/internal_info.json @@ -0,0 +1,40 @@ +{ + "mini-agi": { + "TestBasicMemory": [true, true, true], + "TestBasicRetrieval": [true, true, true], + "TestCreateSimpleWebServer": [false, false, false], + "TestDebugSimpleTypoWithGuidance": [ + false, + false, + false, + false, + false, + false + ], + "TestDebugSimpleTypoWithoutGuidance": [false, false, false], + "TestReadFile": [true, true, true, true], + "TestRememberMultipleIds": [true, true, true], + "TestRememberMultipleIdsWithNoise": [true, true, true], + "TestRememberMultiplePhrasesWithNoise": [true, true, true], + "TestRetrieval2": [true, true, true], + "TestRetrieval3": [true, true, true], + "TestSearch": [true, true, true, true], + "TestWriteFile": [ + true, + true, + true, + false, + false, + false, + false, + true, + false, + true, + false, + false, + false, + false, + true + ] + } +} diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py index c69509c70..e99a1fa05 100644 --- a/agbenchmark/utils.py +++ b/agbenchmark/utils.py @@ -17,7 +17,6 @@ HOME_ENV = os.getenv("HOME_ENV") def calculate_info_test_path(reports_path: Path) -> str: - print("reports_pathreports_pathreports_pathreports_path", reports_path) if not reports_path.exists(): reports_path.mkdir(parents=True, exist_ok=True) return str( @@ -129,6 +128,7 @@ def calculate_dynamic_paths() -> tuple[Path, str, str, str]: CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH = assign_paths( benchmarks_folder_path ) + else: # otherwise the default is when home is an agent (running agbenchmark from agent/agent_repo) # used when its just a pip install @@ -139,4 +139,9 @@ def calculate_dynamic_paths() -> tuple[Path, str, str, str]: if not benchmarks_folder_path.exists(): benchmarks_folder_path.mkdir(exist_ok=True) - return HOME_DIRECTORY, CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH + return ( + HOME_DIRECTORY, + CONFIG_PATH, + REGRESSION_TESTS_PATH, + INFO_TESTS_PATH, + ) -- cgit v1.2.3 From 8aa6452cc4c76610597ae56f90d5af91170cd1eb Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Mon, 17 Jul 2023 11:24:16 -0400 Subject: file naming when --test (#164) --- agbenchmark/reports/internal_info.json | 110 +++++++++++++-------- .../reports/mini-agi/1.1_TestWriteFile.json | 36 +++++++ agbenchmark/reports/mini-agi/1_TestWriteFIle.json | 27 +++++ agbenchmark/reports/mini-agi/2.1_TestReadFile.json | 27 +++++ agbenchmark/reports/mini-agi/2_TestReadFile.json | 27 +++++ agbenchmark/reports/mini-agi/3_TestSearch.json | 27 +++++ .../4.1_TestDebugSimpleTypoWithGuidance.json | 28 ++++++ .../4_TestDebugSimpleTypoWithGuidance.json | 28 ++++++ .../reports/mini-agi/file1_07-16-13-07.json | 23 ----- agbenchmark/utils.py | 52 ++++++++-- agent/mini-agi | 2 +- 11 files changed, 315 insertions(+), 72 deletions(-) create mode 100644 agbenchmark/reports/mini-agi/1.1_TestWriteFile.json create mode 100644 agbenchmark/reports/mini-agi/1_TestWriteFIle.json create mode 100644 agbenchmark/reports/mini-agi/2.1_TestReadFile.json create mode 100644 
agbenchmark/reports/mini-agi/2_TestReadFile.json create mode 100644 agbenchmark/reports/mini-agi/3_TestSearch.json create mode 100644 agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json create mode 100644 agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json delete mode 100644 agbenchmark/reports/mini-agi/file1_07-16-13-07.json diff --git a/agbenchmark/reports/internal_info.json b/agbenchmark/reports/internal_info.json index 97b525c0f..0bfad744a 100644 --- a/agbenchmark/reports/internal_info.json +++ b/agbenchmark/reports/internal_info.json @@ -1,40 +1,72 @@ { - "mini-agi": { - "TestBasicMemory": [true, true, true], - "TestBasicRetrieval": [true, true, true], - "TestCreateSimpleWebServer": [false, false, false], - "TestDebugSimpleTypoWithGuidance": [ - false, - false, - false, - false, - false, - false - ], - "TestDebugSimpleTypoWithoutGuidance": [false, false, false], - "TestReadFile": [true, true, true, true], - "TestRememberMultipleIds": [true, true, true], - "TestRememberMultipleIdsWithNoise": [true, true, true], - "TestRememberMultiplePhrasesWithNoise": [true, true, true], - "TestRetrieval2": [true, true, true], - "TestRetrieval3": [true, true, true], - "TestSearch": [true, true, true, true], - "TestWriteFile": [ - true, - true, - true, - false, - false, - false, - false, - true, - false, - true, - false, - false, - false, - false, - true - ] - } -} + "mini-agi": { + "TestBasicMemory": [ + true, + true, + true + ], + "TestBasicRetrieval": [ + true, + true, + true + ], + "TestCreateSimpleWebServer": [ + false, + false, + false + ], + "TestDebugSimpleTypoWithGuidance": [ + false, + false, + false + ], + "TestDebugSimpleTypoWithoutGuidance": [ + false, + false, + false + ], + "TestReadFile": [ + true, + true, + true, + true, + true + ], + "TestRememberMultipleIds": [ + true, + true, + true + ], + "TestRememberMultipleIdsWithNoise": [ + true, + true, + true + ], + "TestRememberMultiplePhrasesWithNoise": [ + true, + true, + true + ], + "TestRetrieval2": [ + true, + true, + true + ], + "TestRetrieval3": [ + true, + true, + true + ], + "TestSearch": [ + true, + true, + true, + true + ], + "TestWriteFile": [ + true, + true, + true + ] + } +} \ No newline at end of file diff --git a/agbenchmark/reports/mini-agi/1.1_TestWriteFile.json b/agbenchmark/reports/mini-agi/1.1_TestWriteFile.json new file mode 100644 index 000000000..637c2d5c5 --- /dev/null +++ b/agbenchmark/reports/mini-agi/1.1_TestWriteFile.json @@ -0,0 +1,36 @@ +{ + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": true, + "metrics": { + "difficulty": "interface", + "success": true, + "non_mock_success_%": 100.0, + "run_time": "0.009 seconds" + } + }, + "additional": { + "model": "gpt-3.5-turbo" + }, + "command": "agbenchmark start --test TestWriteFile", + "completion_time": "2023-07-17-09:54", + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "metrics": { + "run_time": "22.36 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 40.0, + "run_time": "22.169 seconds" + } + } + } +} \ No newline at end of file diff --git a/agbenchmark/reports/mini-agi/1_TestWriteFIle.json b/agbenchmark/reports/mini-agi/1_TestWriteFIle.json new file mode 100644 index 000000000..e64783190 --- /dev/null +++ 
b/agbenchmark/reports/mini-agi/1_TestWriteFIle.json @@ -0,0 +1,27 @@ +{ + "command": "agbenchmark start --test TestWriteFile", + "completion_time": "2023-07-15-22:13", + "metrics": { + "run_time": "12.4 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 50.0, + "run_time": "12.127 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}", + "entry_path": "agbenchmark.benchmarks" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/2.1_TestReadFile.json b/agbenchmark/reports/mini-agi/2.1_TestReadFile.json new file mode 100644 index 000000000..b5d73af99 --- /dev/null +++ b/agbenchmark/reports/mini-agi/2.1_TestReadFile.json @@ -0,0 +1,27 @@ +{ + "command": "agbenchmark start --test TestReadFile", + "completion_time": "2023-07-17-10:12", + "metrics": { + "run_time": "65.27 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": true, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "65.074 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4", + "reached_termination_time": true + } +} diff --git a/agbenchmark/reports/mini-agi/2_TestReadFile.json b/agbenchmark/reports/mini-agi/2_TestReadFile.json new file mode 100644 index 000000000..869eaaac1 --- /dev/null +++ b/agbenchmark/reports/mini-agi/2_TestReadFile.json @@ -0,0 +1,27 @@ +{ + "command": "agbenchmark start --test TestReadFile", + "completion_time": "2023-07-15-22:13", + "metrics": { + "run_time": "31.2 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": true, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "30.903 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}", + "entry_path": "agbenchmark.benchmarks" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/3_TestSearch.json b/agbenchmark/reports/mini-agi/3_TestSearch.json new file mode 100644 index 000000000..d9d05db4a --- /dev/null +++ b/agbenchmark/reports/mini-agi/3_TestSearch.json @@ -0,0 +1,27 @@ +{ + "command": "agbenchmark start --test TestSearch", + "completion_time": "2023-07-15-22:14", + "metrics": { + "run_time": "16.88 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": true, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "16.572 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}", + "entry_path": "agbenchmark.benchmarks" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json b/agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json new file mode 100644 index 000000000..d72d599d8 --- /dev/null +++ b/agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json @@ -0,0 +1,28 @@ +{ + "command": "agbenchmark start --test 
TestDebugSimpleTypoWithGuidance", + "completion_time": "2023-07-15-22:16", + "metrics": { + "run_time": "45.92 seconds", + "highest_difficulty": ": 0" + }, + "tests": { + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "45.599 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}", + "entry_path": "agbenchmark.benchmarks" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json b/agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json new file mode 100644 index 000000000..7985a7843 --- /dev/null +++ b/agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json @@ -0,0 +1,28 @@ +{ + "command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance", + "completion_time": "2023-07-15-22:15", + "metrics": { + "run_time": "32.99 seconds", + "highest_difficulty": ": 0" + }, + "tests": { + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "32.582 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}", + "entry_path": "agbenchmark.benchmarks" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/file1_07-16-13-07.json b/agbenchmark/reports/mini-agi/file1_07-16-13-07.json deleted file mode 100644 index 78bafc5f1..000000000 --- a/agbenchmark/reports/mini-agi/file1_07-16-13-07.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "command": "agbenchmark start --test TestWriteFile", - "completion_time": "2023-07-16-13:07", - "metrics": { - "run_time": "13.91 seconds", - "highest_difficulty": "interface: 1" - }, - "tests": { - "TestWriteFile": { - "data_path": "agbenchmark/challenges/interface/write_file", - "is_regression": false, - "metrics": { - "difficulty": "interface", - "success": true, - "success_%": 30.0, - "run_time": "13.684 seconds" - } - } - }, - "config": { - "workspace": "${os.path.join(Path.home(), 'miniagi')}" - } -} \ No newline at end of file diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py index e99a1fa05..5f1bb30da 100644 --- a/agbenchmark/utils.py +++ b/agbenchmark/utils.py @@ -1,7 +1,9 @@ # radio charts, logs, helper functions for tests, anything else relevant. 
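The hunk below replaces the flat `fileN_<timestamp>.json` report naming with a per-test prefix scheme: the first run of a new test takes the next whole-number prefix (e.g. `5_TestSearch.json`), and each repeat run of the same test gets a dotted sub-index (`5.1_TestSearch.json`, `5.2_TestSearch.json`). A minimal sketch of that naming rule, assuming the same `--test <name>` CLI convention (`next_report_name` and its `existing` argument are illustrative names, not part of the codebase):

    import math
    from pathlib import Path

    def next_report_name(existing: list[str], test_arg: str) -> str:
        # Repeat runs of a test keep its whole-number prefix and append a .N sub-index;
        # brand-new tests increment the highest whole-number prefix seen so far.
        related = [f for f in existing if test_arg in f]
        if not related:
            prefixes = [math.floor(float(Path(f).stem.split("_")[0])) for f in existing]
            return f"{max(prefixes, default=0) + 1}_{test_arg}.json"
        prefix = math.floor(float(Path(related[0]).stem.rsplit("_", 1)[0].split(".")[0]))
        return f"{prefix}.{len(related)}_{test_arg}.json"

    # Hypothetical usage:
    #   next_report_name(["1_TestWriteFile.json"], "TestReadFile")  -> "2_TestReadFile.json"
    #   next_report_name(["1_TestWriteFile.json"], "TestWriteFile") -> "1.1_TestWriteFile.json"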
import glob +import math import os import re +import sys from datetime import datetime from pathlib import Path from typing import Any @@ -17,17 +19,49 @@ HOME_ENV = os.getenv("HOME_ENV") def calculate_info_test_path(reports_path: Path) -> str: + command = sys.argv + if not reports_path.exists(): reports_path.mkdir(parents=True, exist_ok=True) - return str( - reports_path / f"file1_{datetime.now().strftime('%m-%d-%H-%M')}.json" - ) - else: - json_files = glob.glob(str(reports_path / "*.json")) - file_count = len(json_files) - run_name = f"file{file_count + 1}_{datetime.now().strftime('%m-%d-%H-%M')}.json" - new_file_path = reports_path / run_name - return str(new_file_path) + + json_files = glob.glob(str(reports_path / "*.json")) + + # Default naming scheme + file_count = len(json_files) + run_name = f"file{file_count + 1}_{datetime.now().strftime('%m-%d-%H-%M')}.json" + + # If "--test" is in command + if "--test" in command: + test_index = command.index("--test") + try: + test_arg = command[test_index + 1] # Argument after --test + except IndexError: + raise ValueError("Expected an argument after --test") + + # Get all files that include the string that is the argument after --test + related_files = [f for f in json_files if test_arg in f] + related_file_count = len(related_files) + + # Determine the prefix based on the existing files + if related_file_count == 0: + # Try to find the highest prefix number among all files, then increment it + all_prefix_numbers = [] + for f in json_files: + number = float(Path(f).stem.split("_")[0]) + all_prefix_numbers.append(math.floor(number)) + + max_prefix = max(all_prefix_numbers, default=0) + print(f"Highest existing report prefix: {max_prefix}") + run_name = f"{max_prefix + 1}_{test_arg}.json" + else: + # Take the number from before the _ and add the .{number} + prefix_str = Path(related_files[0]).stem.rsplit("_", 1)[0].split(".")[0] + prefix = math.floor(float(prefix_str)) + run_name = f"{prefix}.{related_file_count}_{test_arg}.json" + + print(f"Generated report file name: {run_name}") + new_file_path = reports_path / run_name + return str(new_file_path) def replace_backslash(value: Any) -> Any: diff --git a/agent/mini-agi b/agent/mini-agi index bb02bf0d5..0a9fcd8c3 160000 --- a/agent/mini-agi +++ b/agent/mini-agi @@ -1 +1 @@ -Subproject commit bb02bf0d5cdbf045ff145271b78e4b4ee7225011 +Subproject commit 0a9fcd8c3d6352ef42d436cff7b64683a7a7ca2d -- cgit v1.2.3 From 515742ee61387593e0c6b21b15e92e35ead78a09 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Mon, 17 Jul 2023 19:11:55 +0200 Subject: Fix loading the plugins config (#5000) --- autogpt/config/config.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/autogpt/config/config.py b/autogpt/config/config.py index cb3f26d3e..b41ff68a1 100644 --- a/autogpt/config/config.py +++ b/autogpt/config/config.py @@ -290,11 +290,6 @@ class ConfigBuilder(Configurable[Config]): config_dict["plugins_allowlist"] = _safe_split(os.getenv("ALLOWLISTED_PLUGINS")) config_dict["plugins_denylist"] = _safe_split(os.getenv("DENYLISTED_PLUGINS")) - config_dict["plugins_config"] = PluginsConfig.load_config( - config_dict["plugins_config_file"], - config_dict["plugins_denylist"], - config_dict["plugins_allowlist"], - ) with contextlib.suppress(TypeError): config_dict["image_size"] = int(os.getenv("IMAGE_SIZE")) @@ -318,7 +313,17 @@ class ConfigBuilder(Configurable[Config]): k: v for k, v in config_dict.items() if v is not None } - return
cls.build_agent_configuration(config_dict_without_none_values) + config = cls.build_agent_configuration(config_dict_without_none_values) + + # Set secondary config variables (that depend on other config variables) + + config.plugins_config = PluginsConfig.load_config( + config.plugins_config_file, + config.plugins_denylist, + config.plugins_allowlist, + ) + + return config @classmethod def load_azure_config(cls, config_file: str = AZURE_CONFIG_FILE) -> Dict[str, str]: -- cgit v1.2.3 From a758acef2cf12b206d7172b47880dd876f8ad4bc Mon Sep 17 00:00:00 2001 From: Sohrab Saran Date: Mon, 17 Jul 2023 23:54:47 +0530 Subject: Fix `execute_python_file` workspace mount & Windows path formatting (#4996) * fix for #4975 * Add TODO based on code comment. * Use builtin `Path.as_posix()` * Remove TODO --------- Co-authored-by: Reinier van der Leer --- autogpt/commands/execute_code.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/autogpt/commands/execute_code.py b/autogpt/commands/execute_code.py index 2403b2ba5..fb4cb70ea 100644 --- a/autogpt/commands/execute_code.py +++ b/autogpt/commands/execute_code.py @@ -145,11 +145,14 @@ def execute_python_file(filename: str, agent: Agent) -> str: logger.debug(f"Running {file_path} in a {image_name} container...") container: DockerContainer = client.containers.run( image_name, - ["python", str(file_path.relative_to(agent.workspace.root))], + [ + "python", + file_path.relative_to(agent.workspace.root).as_posix(), + ], volumes={ agent.config.workspace_path: { "bind": "/workspace", - "mode": "ro", + "mode": "rw", } }, working_dir="/workspace", -- cgit v1.2.3 From d76317fbf38945df2aa65e6d1fc26acae3739ead Mon Sep 17 00:00:00 2001 From: Luke <2609441+lc0rp@users.noreply.github.com> Date: Mon, 17 Jul 2023 20:11:30 -0400 Subject: Update BULLETIN.md and version numbers (#5002) Co-authored-by: lc0rp <2609411+lc0rp@users.noreply.github.com> --- BULLETIN.md | 29 +++++++++++++---------------- pyproject.toml | 2 +- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/BULLETIN.md b/BULLETIN.md index 117a436a8..a857a7ce1 100644 --- a/BULLETIN.md +++ b/BULLETIN.md @@ -4,26 +4,23 @@ 📖 *User Guide*: https://docs.agpt.co. 👩 *Contributors Wiki*: https://github.com/Significant-Gravitas/Auto-GPT/wiki/Contributing. -# v0.4.4 RELEASE HIGHLIGHTS! 🚀 +# v0.4.5 RELEASE HIGHLIGHTS! 🚀 # ----------------------------- -## GPT-4 is back! -Following OpenAI's recent GPT-4 GA announcement, the SMART_LLM .env setting -now defaults to GPT-4, and Auto-GPT will use GPT-4 by default in its main loop. +This release includes under-the-hood improvements and bug fixes, such as more +accurate token counts for OpenAI functions, faster CI builds, improved plugin +handling, and refactoring of the Config class for better maintainability. -### !! High Costs Warning !! 💰💀🚨 -GPT-4 costs ~20x more than GPT-3.5-turbo. -Please take note of this before using SMART_LLM. You can use `--gpt3only` -or `--gpt4only` to force the use of GPT-3.5-turbo or GPT-4, respectively, -at runtime. +We have also released some documentation updates, including: -## Re-arch v1 preview release! -We've released a preview version of the re-arch code, under `autogpt/core`. -This is a major milestone for us, and we're excited to continue working on it. -We look forward to your feedback. Follow the process here: -https://github.com/Significant-Gravitas/Auto-GPT/issues/4770. 
+- *How to share your system logs* + Visit [docs/share-your-logs.md] to learn how to share logs with us + via a log analyzer graciously contributed by https://www.e2b.dev/ -- *Auto-GPT re-architecture documentation* + You can learn more about the inner workings of the Auto-GPT re-architecture + released last cycle, via these links: + * [autogpt/core/README.md] + * [autogpt/core/ARCHITECTURE_NOTES.md] Take a look at the Release Notes on GitHub for the full changelog! https://github.com/Significant-Gravitas/Auto-GPT/releases. diff --git a/pyproject.toml b/pyproject.toml index 06b2f87f8..f16ee501f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "agpt" -version = "0.4.4" +version = "0.4.5" authors = [ { name="Torantulino", email="support@agpt.co" }, ] -- cgit v1.2.3 From 2d8fa5ca6f26f5c8b36d7d4e84187e9a0bea81dc Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Mon, 17 Jul 2023 17:15:10 -0700 Subject: Use report location (#165) --- .github/workflows/ci.yml | 8 ++++---- agbenchmark/utils.py | 4 ++++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a2224ea78..3b0dc50fe 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -8,7 +8,7 @@ on: push: branches: [master, ci-test*] paths-ignore: - - 'benchmark_runs/**' + - 'reports/**' pull_request: branches: [stable, master, release-*] @@ -167,7 +167,7 @@ jobs: REQUESTS_CA_BUNDLE: /etc/ssl/certs/ca-certificates.crt HELICONE_CACHE_ENABLED: ${{ matrix.cache-enabled }} HELICONE_PROPERTY_AGENT: ${{ matrix.agent-name }} - REPORT_LOCATION: ${{ matrix.cache-enabled == false && format('../../../benchmark_runs/{0}', matrix.agent-name) || '.'
}} + REPORT_LOCATION: ${{ format('../../reports/{0}', matrix.agent-name) }} - name: Upload reports @@ -175,7 +175,7 @@ jobs: uses: actions/upload-artifact@v3 with: name: ${{ matrix.agent-name }} - path: benchmark_runs/${{ matrix.agent-name }} + path: reports/${{ matrix.agent-name }} - name: Authenticate and Push to Branch if: (success() || failure()) && (github.event_name != 'pull_request' && matrix.cache-enabled == false) @@ -183,7 +183,7 @@ jobs: git config --global user.email "github-bot@agpt.co" git config --global user.name "Auto-GPT-Bot" - git add benchmark_runs/* || echo "nothing to commit" + git add reports/* || echo "nothing to commit" commit_message="${{ matrix.agent-name }}-$(date +'%Y%m%d%H%M%S')" git commit -m "${commit_message}" diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py index 5f1bb30da..f1ed43639 100644 --- a/agbenchmark/utils.py +++ b/agbenchmark/utils.py @@ -19,6 +19,10 @@ HOME_ENV = os.getenv("HOME_ENV") def calculate_info_test_path(reports_path: Path) -> str: + report_location = os.getenv("REPORT_LOCATION", ".") + if report_location: + reports_path = Path(os.getcwd()) / report_location + command = sys.argv if not reports_path.exists(): -- cgit v1.2.3 From ed5fd3416ac48b6b02a0497d983675c0486f70ee Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Tue, 18 Jul 2023 00:17:59 +0000 Subject: smol-developer-20230718001759 --- reports/smol-developer/file1_07-18-00-17.json | 176 ++++++++++++++++++++++++++ 1 file changed, 176 insertions(+) create mode 100644 reports/smol-developer/file1_07-18-00-17.json diff --git a/reports/smol-developer/file1_07-18-00-17.json b/reports/smol-developer/file1_07-18-00-17.json new file mode 100644 index 000000000..1842163b5 --- /dev/null +++ b/reports/smol-developer/file1_07-18-00-17.json @@ -0,0 +1,176 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-18-00:17", + "metrics": { + "run_time": "41.3 seconds", + "highest_difficulty": "novice: 3" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "5.554 seconds" + } + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d4", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": true, + "success_%": 100.0, + "run_time": "8.223 seconds" + } + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "16.099 seconds" + } + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "9.624 seconds" + } + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5", + "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "1.625 seconds" + } + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on 
agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRetrieval2": { + "data_path": "agbenchmark/challenges/retrieval/r2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d3", + "is_regression": false, + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + } + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + 
"is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRememberMultipleIdsWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3", + "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIdsWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4", + "is_regression": false, + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIdsWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + } + }, + "config": { + "workspace": "generated" + } +} \ No newline at end of file -- cgit v1.2.3 From cf14609b518c489c4c62085c6a46f993c2268595 Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Tue, 18 Jul 2023 00:22:09 +0000 Subject: Auto-GPT-20230718002209 --- reports/Auto-GPT/file1_07-18-00-18.json | 177 ++++++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 reports/Auto-GPT/file1_07-18-00-18.json diff --git a/reports/Auto-GPT/file1_07-18-00-18.json b/reports/Auto-GPT/file1_07-18-00-18.json new file mode 100644 index 000000000..aa693304d --- /dev/null +++ b/reports/Auto-GPT/file1_07-18-00-18.json @@ -0,0 +1,177 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-18-00:22", + "metrics": { + "run_time": "239.83 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "35.666 seconds" + } + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d4", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "15.512 seconds" + } + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0, 0.0]", + "success_%": 0.0, + "run_time": "126.148 seconds" + } + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "62.169 seconds" + } + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5", 
+ "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestThreeSum::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRetrieval2": { + "data_path": "agbenchmark/challenges/retrieval/r2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d3", + "is_regression": false, + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2", + "is_regression": false, + 
"metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRememberMultipleIdsWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3", + "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIdsWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4", + "is_regression": false, + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIdsWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + } + }, + "config": { + "workspace": "autogpt/workspace/auto_gpt_workspace" + } +} \ No newline at end of file -- cgit v1.2.3 From c7a5498f0f45c015e48a013cc172682b86e5b13a Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Tue, 18 Jul 2023 00:25:27 +0000 Subject: gpt-engineer-20230718002527 --- reports/gpt-engineer/file1_07-18-00-17.json | 173 ++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100644 reports/gpt-engineer/file1_07-18-00-17.json diff --git a/reports/gpt-engineer/file1_07-18-00-17.json b/reports/gpt-engineer/file1_07-18-00-17.json new file mode 100644 index 000000000..d7d3c1ee6 --- /dev/null +++ b/reports/gpt-engineer/file1_07-18-00-17.json @@ -0,0 +1,173 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-18-00:25", + "metrics": { + "run_time": "493.76 seconds", + "highest_difficulty": "intermediate: 4" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "66.807 seconds" + } + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d4", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": true, + "success_%": 100.0, + "run_time": "84.302 seconds" + } + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "74.761 seconds" + } + }, + "TestReadFile": { + 
"data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: '/home/runner/work/Auto-GPT-Benchmarks/Auto-GPT-Benchmarks/agent/gpt-engineer/projects/my-new-project/workspace/file_to_check.txt'", + "success_%": 0.0, + "run_time": "45.324 seconds" + } + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5", + "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": true, + "success_%": 100.0, + "run_time": "86.25 seconds" + } + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": true, + "success_%": 100.0, + "run_time": "76.728 seconds" + } + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRetrieval2": { + "data_path": "agbenchmark/challenges/retrieval/r2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "assert 1 in [0.0, 0.0]", + "success_%": 0.0, + "run_time": "59.412 seconds" + } + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d3", + "is_regression": false, + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": 
"agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRememberMultipleIdsWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3", + "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIdsWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4", + "is_regression": false, + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIdsWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + } + }, + "config": { + "workspace": "projects/my-new-project/workspace" + } +} \ No newline at end of file -- cgit v1.2.3 From 12c5d545837b5256f34695820601f1797b489703 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Mon, 17 Jul 2023 22:41:58 -0400 Subject: Fixing memory challenges, naming, testing mini-agi, smooth retrieval scaling (#166) --- agbenchmark/challenge.py | 11 +-- .../challenges/code/d1/artifacts_in/__init__.py | 0 .../challenges/code/d1/artifacts_in/code.py | 13 ---- .../challenges/code/d1/artifacts_in/test.py | 31 --------- .../challenges/code/d1/artifacts_out/__init__.py | 0 .../challenges/code/d1/artifacts_out/code.py | 12 ---- .../challenges/code/d1/artifacts_out/test.py | 31 --------- agbenchmark/challenges/code/d1/data.json | 19 ------ .../code/d1_debug/artifacts_in/__init__.py | 0 .../challenges/code/d1_debug/artifacts_in/code.py | 13 ++++ .../challenges/code/d1_debug/artifacts_in/test.py | 31 +++++++++ .../code/d1_debug/artifacts_out/__init__.py | 0 .../challenges/code/d1_debug/artifacts_out/code.py | 12 ++++ .../challenges/code/d1_debug/artifacts_out/test.py | 31 +++++++++ agbenchmark/challenges/code/d1_debug/data.json | 19 ++++++ .../challenges/code/d2/artifacts_in/__init__.py | 0 .../challenges/code/d2/artifacts_in/code.py | 13 ---- .../challenges/code/d2/artifacts_in/test.py | 31 --------- .../challenges/code/d2/artifacts_out/__init__.py | 0 .../challenges/code/d2/artifacts_out/code.py | 12 ---- .../challenges/code/d2/artifacts_out/test.py | 31 --------- 
agbenchmark/challenges/code/d2/data.json | 19 ------ .../code/d2_vague/artifacts_in/__init__.py | 0 .../challenges/code/d2_vague/artifacts_in/code.py | 13 ++++ .../challenges/code/d2_vague/artifacts_in/test.py | 31 +++++++++ .../code/d2_vague/artifacts_out/__init__.py | 0 .../challenges/code/d2_vague/artifacts_out/code.py | 12 ++++ .../challenges/code/d2_vague/artifacts_out/test.py | 31 +++++++++ agbenchmark/challenges/code/d2_vague/data.json | 19 ++++++ .../challenges/code/d3/custom_python/api_tests.py | 41 ----------- agbenchmark/challenges/code/d3/data.json | 19 ------ .../code/d3_two_sum/artifacts_out/__init__.py | 0 .../code/d3_two_sum/artifacts_out/code.py | 12 ++++ .../code/d3_two_sum/custom_python/test.py | 31 +++++++++ agbenchmark/challenges/code/d3_two_sum/data.json | 19 ++++++ .../challenges/code/d4/artifacts_out/__init__.py | 0 .../challenges/code/d4/artifacts_out/code.py | 12 ---- .../challenges/code/d4/custom_python/test.py | 31 --------- agbenchmark/challenges/code/d4/data.json | 19 ------ .../code/d4_web_server/custom_python/api_tests.py | 41 +++++++++++ .../challenges/code/d4_web_server/data.json | 19 ++++++ .../challenges/code/d5/artifacts_out/__init__.py | 0 .../challenges/code/d5/artifacts_out/code.py | 23 ------- .../challenges/code/d5/custom_python/test.py | 31 --------- agbenchmark/challenges/code/d5/data.json | 19 ------ .../code/d5_three_sum/artifacts_out/__init__.py | 0 .../code/d5_three_sum/artifacts_out/code.py | 23 +++++++ .../code/d5_three_sum/custom_python/test.py | 31 +++++++++ agbenchmark/challenges/code/d5_three_sum/data.json | 19 ++++++ .../memory/m1/artifacts_in/instructions_1.txt | 2 - .../memory/m1/artifacts_in/instructions_2.txt | 1 - .../memory/m1/artifacts_in/instructions_3.txt | 1 - .../memory/m1/artifacts_in/instructions_4.txt | 1 - .../memory/m1/artifacts_in/instructions_5.txt | 1 - .../memory/m1/artifacts_out/random_file.txt | 1 - agbenchmark/challenges/memory/m1/data.json | 19 ------ .../memory/m1_id/artifacts_in/instructions_1.txt | 2 + .../memory/m1_id/artifacts_in/instructions_2.txt | 1 + .../memory/m1_id/artifacts_in/instructions_3.txt | 1 + .../memory/m1_id/artifacts_in/instructions_4.txt | 1 + .../memory/m1_id/artifacts_in/instructions_5.txt | 1 + .../memory/m1_id/artifacts_out/result.txt | 1 + agbenchmark/challenges/memory/m1_id/data.json | 19 ++++++ .../memory/m2/artifacts_in/instructions_1.txt | 1 - .../memory/m2/artifacts_in/instructions_2.txt | 1 - .../memory/m2/artifacts_in/instructions_3.txt | 1 - .../memory/m2/artifacts_in/instructions_4.txt | 1 - .../memory/m2/artifacts_in/instructions_5.txt | 1 - .../memory/m2/artifacts_out/random_file.txt | 4 -- agbenchmark/challenges/memory/m2/data.json | 19 ------ .../m2_multiple/artifacts_in/instructions_1.txt | 1 + .../m2_multiple/artifacts_in/instructions_2.txt | 1 + .../m2_multiple/artifacts_in/instructions_3.txt | 1 + .../m2_multiple/artifacts_in/instructions_4.txt | 1 + .../m2_multiple/artifacts_in/instructions_5.txt | 1 + .../memory/m2_multiple/artifacts_out/result.txt | 4 ++ .../challenges/memory/m2_multiple/data.json | 19 ++++++ .../memory/m3/artifacts_in/instructions_1.txt | 5 -- .../memory/m3/artifacts_in/instructions_2.txt | 5 -- .../memory/m3/artifacts_in/instructions_3.txt | 5 -- .../memory/m3/artifacts_in/instructions_4.txt | 5 -- .../memory/m3/artifacts_in/instructions_5.txt | 5 -- .../memory/m3/artifacts_out/random_file.txt | 4 -- agbenchmark/challenges/memory/m3/data.json | 19 ------ .../m3_noise/artifacts_in/instructions_1.txt | 5 ++ 
.../m3_noise/artifacts_in/instructions_2.txt | 5 ++ .../m3_noise/artifacts_in/instructions_3.txt | 5 ++ .../m3_noise/artifacts_in/instructions_4.txt | 5 ++ .../m3_noise/artifacts_in/instructions_5.txt | 5 ++ .../memory/m3_noise/artifacts_out/result.txt | 4 ++ agbenchmark/challenges/memory/m3_noise/data.json | 19 ++++++ .../memory/m4/artifacts_in/instructions_1.txt | 5 -- .../memory/m4/artifacts_in/instructions_2.txt | 5 -- .../memory/m4/artifacts_in/instructions_3.txt | 5 -- .../memory/m4/artifacts_in/instructions_4.txt | 5 -- .../memory/m4/artifacts_in/instructions_5.txt | 5 -- .../memory/m4/artifacts_out/random_file.txt | 4 -- agbenchmark/challenges/memory/m4/data.json | 24 ------- .../m4_phrases/artifacts_in/instructions_1.txt | 5 ++ .../m4_phrases/artifacts_in/instructions_2.txt | 5 ++ .../m4_phrases/artifacts_in/instructions_3.txt | 5 ++ .../m4_phrases/artifacts_in/instructions_4.txt | 5 ++ .../m4_phrases/artifacts_in/instructions_5.txt | 5 ++ .../memory/m4_phrases/artifacts_out/result.txt | 4 ++ agbenchmark/challenges/memory/m4_phrases/data.json | 24 +++++++ .../retrieval/r1/artifacts_out/random_file.txt | 1 - agbenchmark/challenges/retrieval/r1/data.json | 19 ------ .../r1_book_price/artifacts_out/random_file.txt | 1 + .../challenges/retrieval/r1_book_price/data.json | 19 ++++++ .../r2.1_specific/artifacts_out/random_file.txt | 1 + .../challenges/retrieval/r2.1_specific/data.json | 19 ++++++ .../r2.2_formatting/artifacts_out/random_file.txt | 1 + .../challenges/retrieval/r2.2_formatting/data.json | 19 ++++++ .../retrieval/r2/artifacts_out/random_file.txt | 1 - agbenchmark/challenges/retrieval/r2/data.json | 19 ------ .../r2_tesla_revenue/artifacts_out/random_file.txt | 1 + .../retrieval/r2_tesla_revenue/data.json | 19 ++++++ agbenchmark/challenges/retrieval/r3/data.json | 2 +- agbenchmark/conftest.py | 9 ++- agbenchmark/reports/internal_info.json | 79 +++++++++++++--------- .../reports/mini-agi/1.1_TestWriteFile.json | 57 +++++++--------- .../10.1_TestRememberMultipleWithNoise.json | 30 ++++++++ .../mini-agi/10_TestRememberMultipleWithNoise.json | 31 +++++++++ .../11.1_TestRememberMultiplePhrasesWithNoise.json | 31 +++++++++ .../11.2_TestRememberMultiplePhrasesWithNoise.json | 31 +++++++++ .../11.3_TestRememberMultiplePhrasesWithNoise.json | 31 +++++++++ .../11.4_TestRememberMultiplePhrasesWithNoise.json | 31 +++++++++ .../11.5_TestRememberMultiplePhrasesWithNoise.json | 31 +++++++++ .../11_TestRememberMultiplePhrasesWithNoise.json | 31 +++++++++ .../12.1_TestDebugSimpleTypoWithGuidance.json | 28 ++++++++ .../12.2_TestDebugSimpleTypoWithGuidance.json | 28 ++++++++ .../12.3_TestDebugSimpleTypoWithGuidance.json | 28 ++++++++ .../12_TestDebugSimpleTypoWithGuidance.json | 31 +++++++++ agbenchmark/reports/mini-agi/1_TestWriteFIle.json | 4 +- agbenchmark/reports/mini-agi/2.1_TestReadFile.json | 4 +- agbenchmark/reports/mini-agi/2_TestReadFile.json | 4 +- agbenchmark/reports/mini-agi/3.1_TestSearch.json | 27 ++++++++ agbenchmark/reports/mini-agi/3_TestSearch.json | 4 +- .../reports/mini-agi/4.1_TestBasicRetrieval.json | 27 ++++++++ .../4.1_TestDebugSimpleTypoWithGuidance.json | 28 -------- .../reports/mini-agi/4_TestBasicRetrieval.json | 27 ++++++++ .../4_TestDebugSimpleTypoWithGuidance.json | 28 -------- .../reports/mini-agi/5.1_TestRetrieval2.0.json | 30 ++++++++ .../reports/mini-agi/5_TestRetrieval2.0.json | 29 ++++++++ .../reports/mini-agi/6.1_TestRetrieval2.1.json | 30 ++++++++ .../reports/mini-agi/6.2_TestRetrieval2.1.json | 30 ++++++++ .../reports/mini-agi/6.3_TestRetrieval2.1.json 
| 30 ++++++++ .../reports/mini-agi/6.4_TestRetrieval2.1.json | 31 +++++++++ .../reports/mini-agi/6_TestRetrieval2.1.json | 30 ++++++++ .../reports/mini-agi/7.1_TestRetrieval2.2.json | 31 +++++++++ .../reports/mini-agi/7_TestRetrieval2.2.json | 30 ++++++++ .../reports/mini-agi/8.1_TestBasicMemory.json | 30 ++++++++ .../reports/mini-agi/8_TestBasicMemory.json | 31 +++++++++ .../mini-agi/9.1_TestRememberMultipleIds.json | 30 ++++++++ .../mini-agi/9_TestRememberMultipleIds.json | 31 +++++++++ agbenchmark/utils.py | 5 +- agent/mini-agi | 2 +- 157 files changed, 1576 insertions(+), 741 deletions(-) delete mode 100644 agbenchmark/challenges/code/d1/artifacts_in/__init__.py delete mode 100644 agbenchmark/challenges/code/d1/artifacts_in/code.py delete mode 100644 agbenchmark/challenges/code/d1/artifacts_in/test.py delete mode 100644 agbenchmark/challenges/code/d1/artifacts_out/__init__.py delete mode 100644 agbenchmark/challenges/code/d1/artifacts_out/code.py delete mode 100644 agbenchmark/challenges/code/d1/artifacts_out/test.py delete mode 100644 agbenchmark/challenges/code/d1/data.json create mode 100644 agbenchmark/challenges/code/d1_debug/artifacts_in/__init__.py create mode 100644 agbenchmark/challenges/code/d1_debug/artifacts_in/code.py create mode 100644 agbenchmark/challenges/code/d1_debug/artifacts_in/test.py create mode 100644 agbenchmark/challenges/code/d1_debug/artifacts_out/__init__.py create mode 100644 agbenchmark/challenges/code/d1_debug/artifacts_out/code.py create mode 100644 agbenchmark/challenges/code/d1_debug/artifacts_out/test.py create mode 100644 agbenchmark/challenges/code/d1_debug/data.json delete mode 100644 agbenchmark/challenges/code/d2/artifacts_in/__init__.py delete mode 100644 agbenchmark/challenges/code/d2/artifacts_in/code.py delete mode 100644 agbenchmark/challenges/code/d2/artifacts_in/test.py delete mode 100644 agbenchmark/challenges/code/d2/artifacts_out/__init__.py delete mode 100644 agbenchmark/challenges/code/d2/artifacts_out/code.py delete mode 100644 agbenchmark/challenges/code/d2/artifacts_out/test.py delete mode 100644 agbenchmark/challenges/code/d2/data.json create mode 100644 agbenchmark/challenges/code/d2_vague/artifacts_in/__init__.py create mode 100644 agbenchmark/challenges/code/d2_vague/artifacts_in/code.py create mode 100644 agbenchmark/challenges/code/d2_vague/artifacts_in/test.py create mode 100644 agbenchmark/challenges/code/d2_vague/artifacts_out/__init__.py create mode 100644 agbenchmark/challenges/code/d2_vague/artifacts_out/code.py create mode 100644 agbenchmark/challenges/code/d2_vague/artifacts_out/test.py create mode 100644 agbenchmark/challenges/code/d2_vague/data.json delete mode 100644 agbenchmark/challenges/code/d3/custom_python/api_tests.py delete mode 100644 agbenchmark/challenges/code/d3/data.json create mode 100644 agbenchmark/challenges/code/d3_two_sum/artifacts_out/__init__.py create mode 100644 agbenchmark/challenges/code/d3_two_sum/artifacts_out/code.py create mode 100644 agbenchmark/challenges/code/d3_two_sum/custom_python/test.py create mode 100644 agbenchmark/challenges/code/d3_two_sum/data.json delete mode 100644 agbenchmark/challenges/code/d4/artifacts_out/__init__.py delete mode 100644 agbenchmark/challenges/code/d4/artifacts_out/code.py delete mode 100644 agbenchmark/challenges/code/d4/custom_python/test.py delete mode 100644 agbenchmark/challenges/code/d4/data.json create mode 100644 agbenchmark/challenges/code/d4_web_server/custom_python/api_tests.py create mode 100644 
agbenchmark/challenges/code/d4_web_server/data.json delete mode 100644 agbenchmark/challenges/code/d5/artifacts_out/__init__.py delete mode 100644 agbenchmark/challenges/code/d5/artifacts_out/code.py delete mode 100644 agbenchmark/challenges/code/d5/custom_python/test.py delete mode 100644 agbenchmark/challenges/code/d5/data.json create mode 100644 agbenchmark/challenges/code/d5_three_sum/artifacts_out/__init__.py create mode 100644 agbenchmark/challenges/code/d5_three_sum/artifacts_out/code.py create mode 100644 agbenchmark/challenges/code/d5_three_sum/custom_python/test.py create mode 100644 agbenchmark/challenges/code/d5_three_sum/data.json delete mode 100644 agbenchmark/challenges/memory/m1/artifacts_in/instructions_1.txt delete mode 100644 agbenchmark/challenges/memory/m1/artifacts_in/instructions_2.txt delete mode 100644 agbenchmark/challenges/memory/m1/artifacts_in/instructions_3.txt delete mode 100644 agbenchmark/challenges/memory/m1/artifacts_in/instructions_4.txt delete mode 100644 agbenchmark/challenges/memory/m1/artifacts_in/instructions_5.txt delete mode 100644 agbenchmark/challenges/memory/m1/artifacts_out/random_file.txt delete mode 100644 agbenchmark/challenges/memory/m1/data.json create mode 100644 agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_1.txt create mode 100644 agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_2.txt create mode 100644 agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_3.txt create mode 100644 agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_4.txt create mode 100644 agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_5.txt create mode 100644 agbenchmark/challenges/memory/m1_id/artifacts_out/result.txt create mode 100644 agbenchmark/challenges/memory/m1_id/data.json delete mode 100644 agbenchmark/challenges/memory/m2/artifacts_in/instructions_1.txt delete mode 100644 agbenchmark/challenges/memory/m2/artifacts_in/instructions_2.txt delete mode 100644 agbenchmark/challenges/memory/m2/artifacts_in/instructions_3.txt delete mode 100644 agbenchmark/challenges/memory/m2/artifacts_in/instructions_4.txt delete mode 100644 agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt delete mode 100644 agbenchmark/challenges/memory/m2/artifacts_out/random_file.txt delete mode 100644 agbenchmark/challenges/memory/m2/data.json create mode 100644 agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_1.txt create mode 100644 agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_2.txt create mode 100644 agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_3.txt create mode 100644 agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_4.txt create mode 100644 agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_5.txt create mode 100644 agbenchmark/challenges/memory/m2_multiple/artifacts_out/result.txt create mode 100644 agbenchmark/challenges/memory/m2_multiple/data.json delete mode 100644 agbenchmark/challenges/memory/m3/artifacts_in/instructions_1.txt delete mode 100644 agbenchmark/challenges/memory/m3/artifacts_in/instructions_2.txt delete mode 100644 agbenchmark/challenges/memory/m3/artifacts_in/instructions_3.txt delete mode 100644 agbenchmark/challenges/memory/m3/artifacts_in/instructions_4.txt delete mode 100644 agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt delete mode 100644 agbenchmark/challenges/memory/m3/artifacts_out/random_file.txt delete mode 100644 agbenchmark/challenges/memory/m3/data.json create 
mode 100644 agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_1.txt create mode 100644 agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_2.txt create mode 100644 agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_3.txt create mode 100644 agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_4.txt create mode 100644 agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_5.txt create mode 100644 agbenchmark/challenges/memory/m3_noise/artifacts_out/result.txt create mode 100644 agbenchmark/challenges/memory/m3_noise/data.json delete mode 100644 agbenchmark/challenges/memory/m4/artifacts_in/instructions_1.txt delete mode 100644 agbenchmark/challenges/memory/m4/artifacts_in/instructions_2.txt delete mode 100644 agbenchmark/challenges/memory/m4/artifacts_in/instructions_3.txt delete mode 100644 agbenchmark/challenges/memory/m4/artifacts_in/instructions_4.txt delete mode 100644 agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt delete mode 100644 agbenchmark/challenges/memory/m4/artifacts_out/random_file.txt delete mode 100644 agbenchmark/challenges/memory/m4/data.json create mode 100644 agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_1.txt create mode 100644 agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_2.txt create mode 100644 agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_3.txt create mode 100644 agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_4.txt create mode 100644 agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_5.txt create mode 100644 agbenchmark/challenges/memory/m4_phrases/artifacts_out/result.txt create mode 100644 agbenchmark/challenges/memory/m4_phrases/data.json delete mode 100644 agbenchmark/challenges/retrieval/r1/artifacts_out/random_file.txt delete mode 100644 agbenchmark/challenges/retrieval/r1/data.json create mode 100644 agbenchmark/challenges/retrieval/r1_book_price/artifacts_out/random_file.txt create mode 100644 agbenchmark/challenges/retrieval/r1_book_price/data.json create mode 100644 agbenchmark/challenges/retrieval/r2.1_specific/artifacts_out/random_file.txt create mode 100644 agbenchmark/challenges/retrieval/r2.1_specific/data.json create mode 100644 agbenchmark/challenges/retrieval/r2.2_formatting/artifacts_out/random_file.txt create mode 100644 agbenchmark/challenges/retrieval/r2.2_formatting/data.json delete mode 100644 agbenchmark/challenges/retrieval/r2/artifacts_out/random_file.txt delete mode 100644 agbenchmark/challenges/retrieval/r2/data.json create mode 100644 agbenchmark/challenges/retrieval/r2_tesla_revenue/artifacts_out/random_file.txt create mode 100644 agbenchmark/challenges/retrieval/r2_tesla_revenue/data.json create mode 100644 agbenchmark/reports/mini-agi/10.1_TestRememberMultipleWithNoise.json create mode 100644 agbenchmark/reports/mini-agi/10_TestRememberMultipleWithNoise.json create mode 100644 agbenchmark/reports/mini-agi/11.1_TestRememberMultiplePhrasesWithNoise.json create mode 100644 agbenchmark/reports/mini-agi/11.2_TestRememberMultiplePhrasesWithNoise.json create mode 100644 agbenchmark/reports/mini-agi/11.3_TestRememberMultiplePhrasesWithNoise.json create mode 100644 agbenchmark/reports/mini-agi/11.4_TestRememberMultiplePhrasesWithNoise.json create mode 100644 agbenchmark/reports/mini-agi/11.5_TestRememberMultiplePhrasesWithNoise.json create mode 100644 agbenchmark/reports/mini-agi/11_TestRememberMultiplePhrasesWithNoise.json create mode 100644 
agbenchmark/reports/mini-agi/12.1_TestDebugSimpleTypoWithGuidance.json create mode 100644 agbenchmark/reports/mini-agi/12.2_TestDebugSimpleTypoWithGuidance.json create mode 100644 agbenchmark/reports/mini-agi/12.3_TestDebugSimpleTypoWithGuidance.json create mode 100644 agbenchmark/reports/mini-agi/12_TestDebugSimpleTypoWithGuidance.json create mode 100644 agbenchmark/reports/mini-agi/3.1_TestSearch.json create mode 100644 agbenchmark/reports/mini-agi/4.1_TestBasicRetrieval.json delete mode 100644 agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json create mode 100644 agbenchmark/reports/mini-agi/4_TestBasicRetrieval.json delete mode 100644 agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json create mode 100644 agbenchmark/reports/mini-agi/5.1_TestRetrieval2.0.json create mode 100644 agbenchmark/reports/mini-agi/5_TestRetrieval2.0.json create mode 100644 agbenchmark/reports/mini-agi/6.1_TestRetrieval2.1.json create mode 100644 agbenchmark/reports/mini-agi/6.2_TestRetrieval2.1.json create mode 100644 agbenchmark/reports/mini-agi/6.3_TestRetrieval2.1.json create mode 100644 agbenchmark/reports/mini-agi/6.4_TestRetrieval2.1.json create mode 100644 agbenchmark/reports/mini-agi/6_TestRetrieval2.1.json create mode 100644 agbenchmark/reports/mini-agi/7.1_TestRetrieval2.2.json create mode 100644 agbenchmark/reports/mini-agi/7_TestRetrieval2.2.json create mode 100644 agbenchmark/reports/mini-agi/8.1_TestBasicMemory.json create mode 100644 agbenchmark/reports/mini-agi/8_TestBasicMemory.json create mode 100644 agbenchmark/reports/mini-agi/9.1_TestRememberMultipleIds.json create mode 100644 agbenchmark/reports/mini-agi/9_TestRememberMultipleIds.json diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py index 4f24bb603..cdaebed4f 100644 --- a/agbenchmark/challenge.py +++ b/agbenchmark/challenge.py @@ -103,22 +103,25 @@ class Challenge(ABC): ] def scoring(self, content: str, ground: Ground) -> float: + print("Scoring content: ", content) if ground.should_contain: for should_contain_word in ground.should_contain: if should_contain_word not in content: + print(f"Word that should exist - {should_contain_word}: False") return 0.0 else: - print( - f"Word that should exist: {should_contain_word} exists in the content" - ) + print(f"Word that should exist - {should_contain_word}: True") if ground.should_not_contain: for should_not_contain_word in ground.should_not_contain: if should_not_contain_word in content: + print( + f"Word that should not exist - {should_not_contain_word}: False" + ) return 0.0 else: print( - f"Word that should not exist: {should_not_contain_word} does not exist in the content" + f"Word that should not exist - {should_not_contain_word}: True" ) return 1.0 diff --git a/agbenchmark/challenges/code/d1/artifacts_in/__init__.py b/agbenchmark/challenges/code/d1/artifacts_in/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/challenges/code/d1/artifacts_in/code.py b/agbenchmark/challenges/code/d1/artifacts_in/code.py deleted file mode 100644 index df8120bfa..000000000 --- a/agbenchmark/challenges/code/d1/artifacts_in/code.py +++ /dev/null @@ -1,13 +0,0 @@ -# mypy: ignore-errors -from typing import List, Optional - - -def two_sum(nums: List, target: int) -> Optional[List[int]]: - seen = {} - for i, num in enumerate(nums): - typo - complement = target - num - if complement in seen: - return [seen[complement], i] - seen[num] = i - return None diff --git a/agbenchmark/challenges/code/d1/artifacts_in/test.py 
b/agbenchmark/challenges/code/d1/artifacts_in/test.py deleted file mode 100644 index d85d13537..000000000 --- a/agbenchmark/challenges/code/d1/artifacts_in/test.py +++ /dev/null @@ -1,31 +0,0 @@ -# mypy: ignore-errors -from code import two_sum -from typing import List - - -def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: - result = two_sum(nums, target) - print(result) - assert ( - result == expected_result - ), f"AssertionError: Expected the output to be {expected_result}" - - -if __name__ == "__main__": - # test the trivial case with the first two numbers - nums = [2, 7, 11, 15] - target = 9 - expected_result = [0, 1] - test_two_sum(nums, target, expected_result) - - # test for ability to use zero and the same number twice - nums = [2, 7, 0, 15, 12, 0] - target = 0 - expected_result = [2, 5] - test_two_sum(nums, target, expected_result) - - # test for first and last index usage and negative numbers - nums = [-6, 7, 11, 4] - target = -2 - expected_result = [0, 3] - test_two_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d1/artifacts_out/__init__.py b/agbenchmark/challenges/code/d1/artifacts_out/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/challenges/code/d1/artifacts_out/code.py b/agbenchmark/challenges/code/d1/artifacts_out/code.py deleted file mode 100644 index de3d8c62c..000000000 --- a/agbenchmark/challenges/code/d1/artifacts_out/code.py +++ /dev/null @@ -1,12 +0,0 @@ -# mypy: ignore-errors -from typing import List, Optional - - -def two_sum(nums: List, target: int) -> Optional[List[int]]: - seen = {} - for i, num in enumerate(nums): - complement = target - num - if complement in seen: - return [seen[complement], i] - seen[num] = i - return None diff --git a/agbenchmark/challenges/code/d1/artifacts_out/test.py b/agbenchmark/challenges/code/d1/artifacts_out/test.py deleted file mode 100644 index d85d13537..000000000 --- a/agbenchmark/challenges/code/d1/artifacts_out/test.py +++ /dev/null @@ -1,31 +0,0 @@ -# mypy: ignore-errors -from code import two_sum -from typing import List - - -def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: - result = two_sum(nums, target) - print(result) - assert ( - result == expected_result - ), f"AssertionError: Expected the output to be {expected_result}" - - -if __name__ == "__main__": - # test the trivial case with the first two numbers - nums = [2, 7, 11, 15] - target = 9 - expected_result = [0, 1] - test_two_sum(nums, target, expected_result) - - # test for ability to use zero and the same number twice - nums = [2, 7, 0, 15, 12, 0] - target = 0 - expected_result = [2, 5] - test_two_sum(nums, target, expected_result) - - # test for first and last index usage and negative numbers - nums = [-6, 7, 11, 4] - target = -2 - expected_result = [0, 3] - test_two_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d1/data.json b/agbenchmark/challenges/code/d1/data.json deleted file mode 100644 index d8e0280a4..000000000 --- a/agbenchmark/challenges/code/d1/data.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "name": "TestDebugSimpleTypoWithGuidance", - "category": ["code", "iterate"], - "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", - "dependencies": ["TestReadFile", "TestWriteFile"], - "cutoff": 60, - "ground": { - "answer": "[0, 1] [2, 5] [0, 3]", - "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], - "should_not_contain": [], - "files": 
["test.py"], - "type": "execute_python_code" - }, - "info": { - "difficulty": "basic", - "description": "Tests ability for the agent to debug python code with a simple typo in it.", - "side_effects": [] - } -} diff --git a/agbenchmark/challenges/code/d1_debug/artifacts_in/__init__.py b/agbenchmark/challenges/code/d1_debug/artifacts_in/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/challenges/code/d1_debug/artifacts_in/code.py b/agbenchmark/challenges/code/d1_debug/artifacts_in/code.py new file mode 100644 index 000000000..df8120bfa --- /dev/null +++ b/agbenchmark/challenges/code/d1_debug/artifacts_in/code.py @@ -0,0 +1,13 @@ +# mypy: ignore-errors +from typing import List, Optional + + +def two_sum(nums: List, target: int) -> Optional[List[int]]: + seen = {} + for i, num in enumerate(nums): + typo + complement = target - num + if complement in seen: + return [seen[complement], i] + seen[num] = i + return None diff --git a/agbenchmark/challenges/code/d1_debug/artifacts_in/test.py b/agbenchmark/challenges/code/d1_debug/artifacts_in/test.py new file mode 100644 index 000000000..d85d13537 --- /dev/null +++ b/agbenchmark/challenges/code/d1_debug/artifacts_in/test.py @@ -0,0 +1,31 @@ +# mypy: ignore-errors +from code import two_sum +from typing import List + + +def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: + result = two_sum(nums, target) + print(result) + assert ( + result == expected_result + ), f"AssertionError: Expected the output to be {expected_result}" + + +if __name__ == "__main__": + # test the trivial case with the first two numbers + nums = [2, 7, 11, 15] + target = 9 + expected_result = [0, 1] + test_two_sum(nums, target, expected_result) + + # test for ability to use zero and the same number twice + nums = [2, 7, 0, 15, 12, 0] + target = 0 + expected_result = [2, 5] + test_two_sum(nums, target, expected_result) + + # test for first and last index usage and negative numbers + nums = [-6, 7, 11, 4] + target = -2 + expected_result = [0, 3] + test_two_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d1_debug/artifacts_out/__init__.py b/agbenchmark/challenges/code/d1_debug/artifacts_out/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/challenges/code/d1_debug/artifacts_out/code.py b/agbenchmark/challenges/code/d1_debug/artifacts_out/code.py new file mode 100644 index 000000000..de3d8c62c --- /dev/null +++ b/agbenchmark/challenges/code/d1_debug/artifacts_out/code.py @@ -0,0 +1,12 @@ +# mypy: ignore-errors +from typing import List, Optional + + +def two_sum(nums: List, target: int) -> Optional[List[int]]: + seen = {} + for i, num in enumerate(nums): + complement = target - num + if complement in seen: + return [seen[complement], i] + seen[num] = i + return None diff --git a/agbenchmark/challenges/code/d1_debug/artifacts_out/test.py b/agbenchmark/challenges/code/d1_debug/artifacts_out/test.py new file mode 100644 index 000000000..d85d13537 --- /dev/null +++ b/agbenchmark/challenges/code/d1_debug/artifacts_out/test.py @@ -0,0 +1,31 @@ +# mypy: ignore-errors +from code import two_sum +from typing import List + + +def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: + result = two_sum(nums, target) + print(result) + assert ( + result == expected_result + ), f"AssertionError: Expected the output to be {expected_result}" + + +if __name__ == "__main__": + # test the trivial case with the first two numbers + nums = [2, 7, 11, 15] + target = 
9 + expected_result = [0, 1] + test_two_sum(nums, target, expected_result) + + # test for ability to use zero and the same number twice + nums = [2, 7, 0, 15, 12, 0] + target = 0 + expected_result = [2, 5] + test_two_sum(nums, target, expected_result) + + # test for first and last index usage and negative numbers + nums = [-6, 7, 11, 4] + target = -2 + expected_result = [0, 3] + test_two_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d1_debug/data.json b/agbenchmark/challenges/code/d1_debug/data.json new file mode 100644 index 000000000..4e2798a37 --- /dev/null +++ b/agbenchmark/challenges/code/d1_debug/data.json @@ -0,0 +1,19 @@ +{ + "name": "TestDebugSimpleTypoWithGuidance", + "category": ["code", "iterate"], + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "dependencies": ["TestReadFile", "TestWriteFile"], + "cutoff": 75, + "ground": { + "answer": "[0, 1] [2, 5] [0, 3]", + "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], + "should_not_contain": [], + "files": ["test.py"], + "type": "execute_python_code" + }, + "info": { + "difficulty": "basic", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/code/d2/artifacts_in/__init__.py b/agbenchmark/challenges/code/d2/artifacts_in/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/challenges/code/d2/artifacts_in/code.py b/agbenchmark/challenges/code/d2/artifacts_in/code.py deleted file mode 100644 index df8120bfa..000000000 --- a/agbenchmark/challenges/code/d2/artifacts_in/code.py +++ /dev/null @@ -1,13 +0,0 @@ -# mypy: ignore-errors -from typing import List, Optional - - -def two_sum(nums: List, target: int) -> Optional[List[int]]: - seen = {} - for i, num in enumerate(nums): - typo - complement = target - num - if complement in seen: - return [seen[complement], i] - seen[num] = i - return None diff --git a/agbenchmark/challenges/code/d2/artifacts_in/test.py b/agbenchmark/challenges/code/d2/artifacts_in/test.py deleted file mode 100644 index d85d13537..000000000 --- a/agbenchmark/challenges/code/d2/artifacts_in/test.py +++ /dev/null @@ -1,31 +0,0 @@ -# mypy: ignore-errors -from code import two_sum -from typing import List - - -def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: - result = two_sum(nums, target) - print(result) - assert ( - result == expected_result - ), f"AssertionError: Expected the output to be {expected_result}" - - -if __name__ == "__main__": - # test the trivial case with the first two numbers - nums = [2, 7, 11, 15] - target = 9 - expected_result = [0, 1] - test_two_sum(nums, target, expected_result) - - # test for ability to use zero and the same number twice - nums = [2, 7, 0, 15, 12, 0] - target = 0 - expected_result = [2, 5] - test_two_sum(nums, target, expected_result) - - # test for first and last index usage and negative numbers - nums = [-6, 7, 11, 4] - target = -2 - expected_result = [0, 3] - test_two_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d2/artifacts_out/__init__.py b/agbenchmark/challenges/code/d2/artifacts_out/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/challenges/code/d2/artifacts_out/code.py b/agbenchmark/challenges/code/d2/artifacts_out/code.py deleted file mode 100644 index de3d8c62c..000000000 --- 
a/agbenchmark/challenges/code/d2/artifacts_out/code.py +++ /dev/null @@ -1,12 +0,0 @@ -# mypy: ignore-errors -from typing import List, Optional - - -def two_sum(nums: List, target: int) -> Optional[List[int]]: - seen = {} - for i, num in enumerate(nums): - complement = target - num - if complement in seen: - return [seen[complement], i] - seen[num] = i - return None diff --git a/agbenchmark/challenges/code/d2/artifacts_out/test.py b/agbenchmark/challenges/code/d2/artifacts_out/test.py deleted file mode 100644 index d85d13537..000000000 --- a/agbenchmark/challenges/code/d2/artifacts_out/test.py +++ /dev/null @@ -1,31 +0,0 @@ -# mypy: ignore-errors -from code import two_sum -from typing import List - - -def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: - result = two_sum(nums, target) - print(result) - assert ( - result == expected_result - ), f"AssertionError: Expected the output to be {expected_result}" - - -if __name__ == "__main__": - # test the trivial case with the first two numbers - nums = [2, 7, 11, 15] - target = 9 - expected_result = [0, 1] - test_two_sum(nums, target, expected_result) - - # test for ability to use zero and the same number twice - nums = [2, 7, 0, 15, 12, 0] - target = 0 - expected_result = [2, 5] - test_two_sum(nums, target, expected_result) - - # test for first and last index usage and negative numbers - nums = [-6, 7, 11, 4] - target = -2 - expected_result = [0, 3] - test_two_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d2/data.json b/agbenchmark/challenges/code/d2/data.json deleted file mode 100644 index de32ef9a7..000000000 --- a/agbenchmark/challenges/code/d2/data.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "name": "TestDebugSimpleTypoWithoutGuidance", - "category": ["code", "iterate"], - "task": "Make test.py run without errors.", - "dependencies": ["TestDebugSimpleTypoWithGuidance"], - "cutoff": 60, - "ground": { - "answer": "[0, 1] [2, 5] [0, 3]", - "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], - "should_not_contain": [], - "files": ["test.py"], - "type": "execute_python_code" - }, - "info": { - "difficulty": "novice", - "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", - "side_effects": [] - } -} diff --git a/agbenchmark/challenges/code/d2_vague/artifacts_in/__init__.py b/agbenchmark/challenges/code/d2_vague/artifacts_in/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/challenges/code/d2_vague/artifacts_in/code.py b/agbenchmark/challenges/code/d2_vague/artifacts_in/code.py new file mode 100644 index 000000000..df8120bfa --- /dev/null +++ b/agbenchmark/challenges/code/d2_vague/artifacts_in/code.py @@ -0,0 +1,13 @@ +# mypy: ignore-errors +from typing import List, Optional + + +def two_sum(nums: List, target: int) -> Optional[List[int]]: + seen = {} + for i, num in enumerate(nums): + typo + complement = target - num + if complement in seen: + return [seen[complement], i] + seen[num] = i + return None diff --git a/agbenchmark/challenges/code/d2_vague/artifacts_in/test.py b/agbenchmark/challenges/code/d2_vague/artifacts_in/test.py new file mode 100644 index 000000000..d85d13537 --- /dev/null +++ b/agbenchmark/challenges/code/d2_vague/artifacts_in/test.py @@ -0,0 +1,31 @@ +# mypy: ignore-errors +from code import two_sum +from typing import List + + +def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: + result = two_sum(nums, target) + print(result) + 
assert ( + result == expected_result + ), f"AssertionError: Expected the output to be {expected_result}" + + +if __name__ == "__main__": + # test the trivial case with the first two numbers + nums = [2, 7, 11, 15] + target = 9 + expected_result = [0, 1] + test_two_sum(nums, target, expected_result) + + # test for ability to use zero and the same number twice + nums = [2, 7, 0, 15, 12, 0] + target = 0 + expected_result = [2, 5] + test_two_sum(nums, target, expected_result) + + # test for first and last index usage and negative numbers + nums = [-6, 7, 11, 4] + target = -2 + expected_result = [0, 3] + test_two_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d2_vague/artifacts_out/__init__.py b/agbenchmark/challenges/code/d2_vague/artifacts_out/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/challenges/code/d2_vague/artifacts_out/code.py b/agbenchmark/challenges/code/d2_vague/artifacts_out/code.py new file mode 100644 index 000000000..de3d8c62c --- /dev/null +++ b/agbenchmark/challenges/code/d2_vague/artifacts_out/code.py @@ -0,0 +1,12 @@ +# mypy: ignore-errors +from typing import List, Optional + + +def two_sum(nums: List, target: int) -> Optional[List[int]]: + seen = {} + for i, num in enumerate(nums): + complement = target - num + if complement in seen: + return [seen[complement], i] + seen[num] = i + return None diff --git a/agbenchmark/challenges/code/d2_vague/artifacts_out/test.py b/agbenchmark/challenges/code/d2_vague/artifacts_out/test.py new file mode 100644 index 000000000..d85d13537 --- /dev/null +++ b/agbenchmark/challenges/code/d2_vague/artifacts_out/test.py @@ -0,0 +1,31 @@ +# mypy: ignore-errors +from code import two_sum +from typing import List + + +def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: + result = two_sum(nums, target) + print(result) + assert ( + result == expected_result + ), f"AssertionError: Expected the output to be {expected_result}" + + +if __name__ == "__main__": + # test the trivial case with the first two numbers + nums = [2, 7, 11, 15] + target = 9 + expected_result = [0, 1] + test_two_sum(nums, target, expected_result) + + # test for ability to use zero and the same number twice + nums = [2, 7, 0, 15, 12, 0] + target = 0 + expected_result = [2, 5] + test_two_sum(nums, target, expected_result) + + # test for first and last index usage and negative numbers + nums = [-6, 7, 11, 4] + target = -2 + expected_result = [0, 3] + test_two_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d2_vague/data.json b/agbenchmark/challenges/code/d2_vague/data.json new file mode 100644 index 000000000..2b6c3526c --- /dev/null +++ b/agbenchmark/challenges/code/d2_vague/data.json @@ -0,0 +1,19 @@ +{ + "name": "TestDebugSimpleTypoWithoutGuidance", + "category": ["code", "iterate"], + "task": "Make test.py run without errors.", + "dependencies": ["TestDebugSimpleTypoWithGuidance"], + "cutoff": 75, + "ground": { + "answer": "[0, 1] [2, 5] [0, 3]", + "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], + "should_not_contain": [], + "files": ["test.py"], + "type": "execute_python_code" + }, + "info": { + "difficulty": "novice", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/code/d3/custom_python/api_tests.py b/agbenchmark/challenges/code/d3/custom_python/api_tests.py deleted file mode 100644 index 
f01934ef8..000000000 --- a/agbenchmark/challenges/code/d3/custom_python/api_tests.py +++ /dev/null @@ -1,41 +0,0 @@ -import os -from typing import Any, Dict -from unittest.mock import Mock, patch - -import requests - - -def test_make_request_and_assert() -> None: - result = make_request_and_assert() - print(result) - expected_result = {"status": "OK"} - error_message = f"AssertionError: Expected the output to be {expected_result}" - print(error_message) - assert result == expected_result, error_message - - -def make_assertion() -> None: - if os.environ.get("MOCK_TEST", "False").lower() == "true": - mock_response = Mock(requests.Response) - mock_response.status_code = 200 - mock_response.json.return_value = {"status": "OK"} - - with patch("requests.get", return_value=mock_response): - make_request_and_assert() - else: - make_request_and_assert() - - -def make_request_and_assert() -> Dict[str, Any]: - response = requests.get("http://localhost:8079/health") - if response.status_code != 200: - raise AssertionError( - f"Expected status code 200, but got {response.status_code}" - ) - - return response.json() - - -if __name__ == "__main__": - # test for the case when server is healthy - test_make_request_and_assert() diff --git a/agbenchmark/challenges/code/d3/data.json b/agbenchmark/challenges/code/d3/data.json deleted file mode 100644 index c5d111a4d..000000000 --- a/agbenchmark/challenges/code/d3/data.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "name": "TestCreateSimpleWebServer", - "category": ["code"], - "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", - "dependencies": ["TestDebugSimpleTypoWithGuidance"], - "cutoff": 60, - "ground": { - "answer": "GET localhost:8079/health responds with a 200 OK", - "should_contain": [], - "should_not_contain": ["AssertionError"], - "files": ["test.py"], - "type": "execute_python_code" - }, - "info": { - "difficulty": "advanced", - "description": "Tests ability for the agent to build a simple web server locally", - "side_effects": [] - } -} diff --git a/agbenchmark/challenges/code/d3_two_sum/artifacts_out/__init__.py b/agbenchmark/challenges/code/d3_two_sum/artifacts_out/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/challenges/code/d3_two_sum/artifacts_out/code.py b/agbenchmark/challenges/code/d3_two_sum/artifacts_out/code.py new file mode 100644 index 000000000..de3d8c62c --- /dev/null +++ b/agbenchmark/challenges/code/d3_two_sum/artifacts_out/code.py @@ -0,0 +1,12 @@ +# mypy: ignore-errors +from typing import List, Optional + + +def two_sum(nums: List, target: int) -> Optional[List[int]]: + seen = {} + for i, num in enumerate(nums): + complement = target - num + if complement in seen: + return [seen[complement], i] + seen[num] = i + return None diff --git a/agbenchmark/challenges/code/d3_two_sum/custom_python/test.py b/agbenchmark/challenges/code/d3_two_sum/custom_python/test.py new file mode 100644 index 000000000..d85d13537 --- /dev/null +++ b/agbenchmark/challenges/code/d3_two_sum/custom_python/test.py @@ -0,0 +1,31 @@ +# mypy: ignore-errors +from code import two_sum +from typing import List + + +def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: + result = two_sum(nums, target) + print(result) + assert ( + result == expected_result + ), f"AssertionError: Expected the output to be {expected_result}" + + +if __name__ == "__main__": + # test the trivial case with the first two numbers + nums = 
[2, 7, 11, 15] + target = 9 + expected_result = [0, 1] + test_two_sum(nums, target, expected_result) + + # test for ability to use zero and the same number twice + nums = [2, 7, 0, 15, 12, 0] + target = 0 + expected_result = [2, 5] + test_two_sum(nums, target, expected_result) + + # test for first and last index usage and negative numbers + nums = [-6, 7, 11, 4] + target = -2 + expected_result = [0, 3] + test_two_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d3_two_sum/data.json b/agbenchmark/challenges/code/d3_two_sum/data.json new file mode 100644 index 000000000..6df083d40 --- /dev/null +++ b/agbenchmark/challenges/code/d3_two_sum/data.json @@ -0,0 +1,19 @@ +{ + "name": "TestBasicCodeGeneration", + "category": ["code"], + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "dependencies": ["TestWriteFile"], + "cutoff": 90, + "ground": { + "answer": "The two_sum function coded properly.", + "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], + "should_not_contain": [], + "files": ["test.py"], + "type": "execute_python_code" + }, + "info": { + "difficulty": "advanced", + "description": "Tests ability for the agent to create the two_sum function.", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/code/d4/artifacts_out/__init__.py b/agbenchmark/challenges/code/d4/artifacts_out/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/challenges/code/d4/artifacts_out/code.py b/agbenchmark/challenges/code/d4/artifacts_out/code.py deleted file mode 100644 index de3d8c62c..000000000 --- a/agbenchmark/challenges/code/d4/artifacts_out/code.py +++ /dev/null @@ -1,12 +0,0 @@ -# mypy: ignore-errors -from typing import List, Optional - - -def two_sum(nums: List, target: int) -> Optional[List[int]]: - seen = {} - for i, num in enumerate(nums): - complement = target - num - if complement in seen: - return [seen[complement], i] - seen[num] = i - return None diff --git a/agbenchmark/challenges/code/d4/custom_python/test.py b/agbenchmark/challenges/code/d4/custom_python/test.py deleted file mode 100644 index d85d13537..000000000 --- a/agbenchmark/challenges/code/d4/custom_python/test.py +++ /dev/null @@ -1,31 +0,0 @@ -# mypy: ignore-errors -from code import two_sum -from typing import List - - -def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: - result = two_sum(nums, target) - print(result) - assert ( - result == expected_result - ), f"AssertionError: Expected the output to be {expected_result}" - - -if __name__ == "__main__": - # test the trivial case with the first two numbers - nums = [2, 7, 11, 15] - target = 9 - expected_result = [0, 1] - test_two_sum(nums, target, expected_result) - - # test for ability to use zero and the same number twice - nums = [2, 7, 0, 15, 12, 0] - target = 0 - expected_result = [2, 5] - test_two_sum(nums, target, expected_result) - - # test for first and last index usage and negative numbers - nums = [-6, 7, 11, 4] - target = -2 - expected_result = [0, 3] - test_two_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d4/data.json b/agbenchmark/challenges/code/d4/data.json deleted file mode 100644 index e8db918d2..000000000 
--- a/agbenchmark/challenges/code/d4/data.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "name": "TestBasicCodeGeneration", - "category": ["code"], - "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", - "dependencies": ["TestWriteFile"], - "cutoff": 60, - "ground": { - "answer": "The two_sum function coded properly.", - "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], - "should_not_contain": [], - "files": ["test.py"], - "type": "execute_python_code" - }, - "info": { - "difficulty": "novice", - "description": "Tests ability for the agent to create the two_sum function.", - "side_effects": [] - } -} diff --git a/agbenchmark/challenges/code/d4_web_server/custom_python/api_tests.py b/agbenchmark/challenges/code/d4_web_server/custom_python/api_tests.py new file mode 100644 index 000000000..f01934ef8 --- /dev/null +++ b/agbenchmark/challenges/code/d4_web_server/custom_python/api_tests.py @@ -0,0 +1,41 @@ +import os +from typing import Any, Dict +from unittest.mock import Mock, patch + +import requests + + +def test_make_request_and_assert() -> None: + result = make_request_and_assert() + print(result) + expected_result = {"status": "OK"} + error_message = f"AssertionError: Expected the output to be {expected_result}" + print(error_message) + assert result == expected_result, error_message + + +def make_assertion() -> None: + if os.environ.get("MOCK_TEST", "False").lower() == "true": + mock_response = Mock(requests.Response) + mock_response.status_code = 200 + mock_response.json.return_value = {"status": "OK"} + + with patch("requests.get", return_value=mock_response): + make_request_and_assert() + else: + make_request_and_assert() + + +def make_request_and_assert() -> Dict[str, Any]: + response = requests.get("http://localhost:8079/health") + if response.status_code != 200: + raise AssertionError( + f"Expected status code 200, but got {response.status_code}" + ) + + return response.json() + + +if __name__ == "__main__": + # test for the case when server is healthy + test_make_request_and_assert() diff --git a/agbenchmark/challenges/code/d4_web_server/data.json b/agbenchmark/challenges/code/d4_web_server/data.json new file mode 100644 index 000000000..5c936e882 --- /dev/null +++ b/agbenchmark/challenges/code/d4_web_server/data.json @@ -0,0 +1,19 @@ +{ + "name": "TestCreateSimpleWebServer", + "category": ["code"], + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. 
", + "dependencies": ["TestDebugSimpleTypoWithGuidance"], + "cutoff": 90, + "ground": { + "answer": "GET localhost:8079/health responds with a 200 OK", + "should_contain": [], + "should_not_contain": ["AssertionError"], + "files": ["test.py"], + "type": "execute_python_code" + }, + "info": { + "difficulty": "advanced", + "description": "Tests ability for the agent to build a simple web server locally", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/code/d5/artifacts_out/__init__.py b/agbenchmark/challenges/code/d5/artifacts_out/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/challenges/code/d5/artifacts_out/code.py b/agbenchmark/challenges/code/d5/artifacts_out/code.py deleted file mode 100644 index 6056691da..000000000 --- a/agbenchmark/challenges/code/d5/artifacts_out/code.py +++ /dev/null @@ -1,23 +0,0 @@ -# mypy: ignore-errors -from typing import List, Optional - - -def three_sum(nums: List[int], target: int) -> Optional[List[int]]: - nums_indices = [(num, index) for index, num in enumerate(nums)] - nums_indices.sort() - for i in range(len(nums_indices) - 2): - if i > 0 and nums_indices[i] == nums_indices[i - 1]: - continue - l, r = i + 1, len(nums_indices) - 1 - while l < r: - three_sum = nums_indices[i][0] + nums_indices[l][0] + nums_indices[r][0] - if three_sum < target: - l += 1 - elif three_sum > target: - r -= 1 - else: - indices = sorted( - [nums_indices[i][1], nums_indices[l][1], nums_indices[r][1]] - ) - return indices - return None diff --git a/agbenchmark/challenges/code/d5/custom_python/test.py b/agbenchmark/challenges/code/d5/custom_python/test.py deleted file mode 100644 index 761b9f5c6..000000000 --- a/agbenchmark/challenges/code/d5/custom_python/test.py +++ /dev/null @@ -1,31 +0,0 @@ -# mypy: ignore-errors -from code import three_sum -from typing import List - - -def test_three_sum(nums: List[int], target: int, expected_result: List[int]) -> None: - result = three_sum(nums, target) - print(result) - assert ( - result == expected_result - ), f"AssertionError: Expected the output to be {expected_result}" - - -if __name__ == "__main__": - # test the trivial case with the first three numbers - nums = [2, 7, 11, 15] - target = 20 - expected_result = [0, 1, 2] - test_three_sum(nums, target, expected_result) - - # test for ability to use zero and the same number twice - nums = [2, 7, 0, 15, 12, 0] - target = 2 - expected_result = [0, 2, 5] - test_three_sum(nums, target, expected_result) - - # test for first and last index usage and negative numbers - nums = [-6, 7, 11, 4] - target = 9 - expected_result = [0, 2, 3] - test_three_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d5/data.json b/agbenchmark/challenges/code/d5/data.json deleted file mode 100644 index 434b1312e..000000000 --- a/agbenchmark/challenges/code/d5/data.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "name": "TestThreeSum", - "category": ["code", "iterate"], - "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", - "dependencies": ["TestWriteFile", "TestBasicCodeGeneration"], - "cutoff": 60, - "ground": { - "answer": "The three_sum function coded properly.", - "should_contain": ["[0, 1, 2]", "[0, 2, 5]", "[0, 2, 3]"], - "should_not_contain": [], - "files": ["test.py"], - "type": "execute_python_code" - }, - "info": { - "difficulty": "intermediate", - "description": "Tests ability for the agent to create the three_sum function.", - "side_effects": [] - } -} diff --git a/agbenchmark/challenges/code/d5_three_sum/artifacts_out/__init__.py b/agbenchmark/challenges/code/d5_three_sum/artifacts_out/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/challenges/code/d5_three_sum/artifacts_out/code.py b/agbenchmark/challenges/code/d5_three_sum/artifacts_out/code.py new file mode 100644 index 000000000..6056691da --- /dev/null +++ b/agbenchmark/challenges/code/d5_three_sum/artifacts_out/code.py @@ -0,0 +1,23 @@ +# mypy: ignore-errors +from typing import List, Optional + + +def three_sum(nums: List[int], target: int) -> Optional[List[int]]: + nums_indices = [(num, index) for index, num in enumerate(nums)] + nums_indices.sort() + for i in range(len(nums_indices) - 2): + if i > 0 and nums_indices[i] == nums_indices[i - 1]: + continue + l, r = i + 1, len(nums_indices) - 1 + while l < r: + three_sum = nums_indices[i][0] + nums_indices[l][0] + nums_indices[r][0] + if three_sum < target: + l += 1 + elif three_sum > target: + r -= 1 + else: + indices = sorted( + [nums_indices[i][1], nums_indices[l][1], nums_indices[r][1]] + ) + return indices + return None diff --git a/agbenchmark/challenges/code/d5_three_sum/custom_python/test.py b/agbenchmark/challenges/code/d5_three_sum/custom_python/test.py new file mode 100644 index 000000000..761b9f5c6 --- /dev/null +++ b/agbenchmark/challenges/code/d5_three_sum/custom_python/test.py @@ -0,0 +1,31 @@ +# mypy: ignore-errors +from code import three_sum +from typing import List + + +def test_three_sum(nums: List[int], target: int, expected_result: List[int]) -> None: + result = three_sum(nums, target) + print(result) + assert ( + result == expected_result + ), f"AssertionError: Expected the output to be {expected_result}" + + +if __name__ == "__main__": + # test the trivial case with the first three numbers + nums = [2, 7, 11, 15] + target = 20 + expected_result = [0, 1, 2] + test_three_sum(nums, target, expected_result) + + # test for ability to use zero and the same number twice + nums = [2, 7, 0, 15, 12, 0] + target = 2 + expected_result = [0, 2, 5] + test_three_sum(nums, target, expected_result) + + # test for first and last index usage and negative numbers + nums = [-6, 7, 11, 4] + target = 9 + expected_result = [0, 2, 3] + test_three_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d5_three_sum/data.json b/agbenchmark/challenges/code/d5_three_sum/data.json new file mode 100644 index 000000000..4df03b45b --- /dev/null +++ b/agbenchmark/challenges/code/d5_three_sum/data.json @@ -0,0 +1,19 @@ +{ + "name": "TestThreeSum", + "category": ["code", "iterate"], + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "dependencies": ["TestWriteFile", "TestBasicCodeGeneration"], + "cutoff": 60, + "ground": { + "answer": "The three_sum function coded properly.", + "should_contain": ["[0, 1, 2]", "[0, 2, 5]", "[0, 2, 3]"], + "should_not_contain": [], + "files": ["test.py"], + "type": "execute_python_code" + }, + "info": { + "difficulty": "advanced", + "description": "Tests ability for the agent to create the three_sum function.", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_1.txt b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_1.txt deleted file mode 100644 index da3babb1f..000000000 --- a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_1.txt +++ /dev/null @@ -1,2 +0,0 @@ -The id to remember is 2314 -Read the file instructions_2.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_2.txt b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_2.txt deleted file mode 100644 index 87c870b1a..000000000 --- a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_2.txt +++ /dev/null @@ -1 +0,0 @@ -Read the file instructions_3.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_3.txt b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_3.txt deleted file mode 100644 index 987086208..000000000 --- a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_3.txt +++ /dev/null @@ -1 +0,0 @@ -Read the file instructions_4.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_4.txt b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_4.txt deleted file mode 100644 index 78da710b8..000000000 --- a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_4.txt +++ /dev/null @@ -1 +0,0 @@ -Read the file instructions_5.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_5.txt deleted file mode 100644 index 8a2d35509..000000000 --- a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_5.txt +++ /dev/null @@ -1 +0,0 @@ -Write the id previously mentioned to a .txt file. diff --git a/agbenchmark/challenges/memory/m1/artifacts_out/random_file.txt b/agbenchmark/challenges/memory/m1/artifacts_out/random_file.txt deleted file mode 100644 index 86be9d159..000000000 --- a/agbenchmark/challenges/memory/m1/artifacts_out/random_file.txt +++ /dev/null @@ -1 +0,0 @@ -2314 diff --git a/agbenchmark/challenges/memory/m1/data.json b/agbenchmark/challenges/memory/m1/data.json deleted file mode 100644 index 33c523244..000000000 --- a/agbenchmark/challenges/memory/m1/data.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "name": "TestBasicMemory", - "category": ["memory"], - "task": "Follow the instructions in the instructions_1.txt file", - "dependencies": ["TestReadFile", "TestWriteFile"], - "cutoff": 60, - "ground": { - "answer": "2314", - "should_contain": ["2314"], - "should_not_contain": [], - "files": [".txt"], - "type": "file" - }, - "info": { - "difficulty": "basic", - "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", - "side_effects": [] - } -} diff --git a/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_1.txt b/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_1.txt new file mode 100644 index 000000000..da3babb1f --- /dev/null +++ b/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_1.txt @@ -0,0 +1,2 @@ +The id to remember is 2314 +Read the file instructions_2.txt diff --git a/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_2.txt b/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_2.txt new file mode 100644 index 000000000..87c870b1a --- /dev/null +++ b/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_2.txt @@ -0,0 +1 @@ +Read the file instructions_3.txt diff --git a/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_3.txt b/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_3.txt new file mode 100644 index 000000000..987086208 --- /dev/null +++ b/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_3.txt @@ -0,0 +1 @@ +Read the file instructions_4.txt diff --git a/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_4.txt b/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_4.txt new file mode 100644 index 000000000..78da710b8 --- /dev/null +++ b/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_4.txt @@ -0,0 +1 @@ +Read the file instructions_5.txt diff --git a/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_5.txt new file mode 100644 index 000000000..73246e041 --- /dev/null +++ b/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_5.txt @@ -0,0 +1 @@ +Write the id previously mentioned to a file called result.txt. diff --git a/agbenchmark/challenges/memory/m1_id/artifacts_out/result.txt b/agbenchmark/challenges/memory/m1_id/artifacts_out/result.txt new file mode 100644 index 000000000..86be9d159 --- /dev/null +++ b/agbenchmark/challenges/memory/m1_id/artifacts_out/result.txt @@ -0,0 +1 @@ +2314 diff --git a/agbenchmark/challenges/memory/m1_id/data.json b/agbenchmark/challenges/memory/m1_id/data.json new file mode 100644 index 000000000..186d25801 --- /dev/null +++ b/agbenchmark/challenges/memory/m1_id/data.json @@ -0,0 +1,19 @@ +{ + "name": "TestBasicMemory", + "category": ["memory"], + "task": "Follow the instructions in the instructions_1.txt file", + "dependencies": ["TestReadFile", "TestWriteFile"], + "cutoff": 60, + "ground": { + "answer": "2314", + "should_contain": ["2314"], + "should_not_contain": [], + "files": ["result.txt"], + "type": "file" + }, + "info": { + "difficulty": "basic", + "description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_1.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_1.txt deleted file mode 100644 index d304ce632..000000000 --- a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_1.txt +++ /dev/null @@ -1 +0,0 @@ -The id to remember is 3145. Read the file instructions_2.txt. 
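The "ground" blocks in these data.json files (for example, TestBasicMemory just above: should_contain ["2314"], files ["result.txt"]) are consumed by the Challenge.scoring method changed in the challenge.py hunk earlier in this patch. Below is a condensed sketch of that check with the print statements omitted; the Ground type here is an assumed stand-in whose field names simply mirror the data.json keys, not the repository's actual model.

from dataclasses import dataclass, field
from typing import List


@dataclass
class Ground:
    # Assumed stand-in; field names mirror the "ground" keys in data.json.
    answer: str
    should_contain: List[str] = field(default_factory=list)
    should_not_contain: List[str] = field(default_factory=list)


def scoring(content: str, ground: Ground) -> float:
    # A single missing required string, or a single present forbidden
    # string, zeroes the score; otherwise the challenge passes outright.
    for word in ground.should_contain:
        if word not in content:
            return 0.0
    for word in ground.should_not_contain:
        if word in content:
            return 0.0
    return 1.0


# Example: the TestBasicMemory ground defined above.
ground = Ground(answer="2314", should_contain=["2314"])
assert scoring("2314\n", ground) == 1.0

Scoring is binary: partial matches earn nothing, which is why these renames also pin the output file to result.txt instead of any .txt file.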
diff --git a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_2.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_2.txt deleted file mode 100644 index cfff66ba2..000000000 --- a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_2.txt +++ /dev/null @@ -1 +0,0 @@ -The id to remember is 3791. Read the file instructions_3.txt. diff --git a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_3.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_3.txt deleted file mode 100644 index ad16b6fdc..000000000 --- a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_3.txt +++ /dev/null @@ -1 +0,0 @@ -The id to remember is 9317. Read the file instructions_4.txt. diff --git a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_4.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_4.txt deleted file mode 100644 index 2394d64bc..000000000 --- a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_4.txt +++ /dev/null @@ -1 +0,0 @@ -The id to remember is 9471. Read the file instructions_5.txt. diff --git a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt deleted file mode 100644 index 2f60afa17..000000000 --- a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt +++ /dev/null @@ -1 +0,0 @@ -Write the ids previously mentioned to a .txt file. diff --git a/agbenchmark/challenges/memory/m2/artifacts_out/random_file.txt b/agbenchmark/challenges/memory/m2/artifacts_out/random_file.txt deleted file mode 100644 index 7d48aaf18..000000000 --- a/agbenchmark/challenges/memory/m2/artifacts_out/random_file.txt +++ /dev/null @@ -1,4 +0,0 @@ -3145 -3791 -9317 -9471 diff --git a/agbenchmark/challenges/memory/m2/data.json b/agbenchmark/challenges/memory/m2/data.json deleted file mode 100644 index 840e8dc83..000000000 --- a/agbenchmark/challenges/memory/m2/data.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "name": "TestRememberMultipleIds", - "category": ["memory"], - "task": "Follow the instructions in the instructions_1.txt file", - "dependencies": ["TestBasicMemory"], - "cutoff": 60, - "ground": { - "answer": "3145\n3791\n9317\n9471", - "should_contain": ["3145", "3791", "9317", "9471"], - "should_not_contain": [], - "files": [".txt"], - "type": "file" - }, - "info": { - "difficulty": "novice", - "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", - "side_effects": [] - } -} diff --git a/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_1.txt b/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_1.txt new file mode 100644 index 000000000..d304ce632 --- /dev/null +++ b/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_1.txt @@ -0,0 +1 @@ +The id to remember is 3145. Read the file instructions_2.txt. diff --git a/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_2.txt b/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_2.txt new file mode 100644 index 000000000..cfff66ba2 --- /dev/null +++ b/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_2.txt @@ -0,0 +1 @@ +The id to remember is 3791. Read the file instructions_3.txt. 
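The m2_multiple files being created here follow the instruction-chain pattern shared by these memory challenges: each instructions_N.txt states one id and points at instructions_{N+1}.txt, and the final file asks the agent to write every id it has seen to result.txt. A hypothetical generator for such a chain, illustrative only and not part of this patch:

from pathlib import Path
from typing import List


def write_instruction_chain(out_dir: Path, ids: List[int]) -> None:
    # Emit instructions_1..N with one id each, plus a final file that
    # asks for the remembered ids to be written to result.txt.
    out_dir.mkdir(parents=True, exist_ok=True)
    for i, current_id in enumerate(ids, start=1):
        (out_dir / f"instructions_{i}.txt").write_text(
            f"The id to remember is {current_id}. "
            f"Read the file instructions_{i + 1}.txt.\n"
        )
    (out_dir / f"instructions_{len(ids) + 1}.txt").write_text(
        "Write the ids previously mentioned to a file called result.txt.\n"
    )


# The four ids used by TestRememberMultipleIds in this patch.
write_instruction_chain(Path("artifacts_in"), [3145, 3791, 9317, 9471])

Run with [3145, 3791, 9317, 9471], this reproduces the five-file layout of this hunk group, with instructions_5.txt carrying only the write instruction; the noise variants that follow interleave the same chain with long distractor strings.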
diff --git a/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_3.txt b/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_3.txt new file mode 100644 index 000000000..ad16b6fdc --- /dev/null +++ b/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_3.txt @@ -0,0 +1 @@ +The id to remember is 9317. Read the file instructions_4.txt. diff --git a/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_4.txt b/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_4.txt new file mode 100644 index 000000000..2394d64bc --- /dev/null +++ b/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_4.txt @@ -0,0 +1 @@ +The id to remember is 9471. Read the file instructions_5.txt. diff --git a/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_5.txt new file mode 100644 index 000000000..30ac41026 --- /dev/null +++ b/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_5.txt @@ -0,0 +1 @@ +Write the ids previously mentioned to a file called result.txt. diff --git a/agbenchmark/challenges/memory/m2_multiple/artifacts_out/result.txt b/agbenchmark/challenges/memory/m2_multiple/artifacts_out/result.txt new file mode 100644 index 000000000..7d48aaf18 --- /dev/null +++ b/agbenchmark/challenges/memory/m2_multiple/artifacts_out/result.txt @@ -0,0 +1,4 @@ +3145 +3791 +9317 +9471 diff --git a/agbenchmark/challenges/memory/m2_multiple/data.json b/agbenchmark/challenges/memory/m2_multiple/data.json new file mode 100644 index 000000000..aac25b21e --- /dev/null +++ b/agbenchmark/challenges/memory/m2_multiple/data.json @@ -0,0 +1,19 @@ +{ + "name": "TestRememberMultipleIds", + "category": ["memory"], + "task": "Follow the instructions in the instructions_1.txt file", + "dependencies": ["TestBasicMemory"], + "cutoff": 60, + "ground": { + "answer": "3145\n3791\n9317\n9471", + "should_contain": ["3145", "3791", "9317", "9471"], + "should_not_contain": [], + "files": ["result.txt"], + "type": "file" + }, + "info": { + "difficulty": "novice", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_1.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_1.txt deleted file mode 100644 index 8deee1b09..000000000 --- a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_1.txt +++ /dev/null @@ -1,5 +0,0 @@ -xBd9cZTfaExYalwtUVR7m0pe3Nfaf5uBS4IxGFZPZcQjDf8Tfk2vdpqWI0ESBWCdVzsdlxMmUxq43INSz1iftsv6PTOMGQ88Cojwj5mQXp8XKZ6VJC893BDLVLLW00iQy8VerWjQay9rAJz8rYVZHa6dByYNWZNOgtjC7ejnKt0VYZtUFfRBZNWG2HNX3tgX8H2h4xeu7GIfm4wETGvzlZDANeqiY2hMCXEAsliVXXjgmvVeB05tjkS5uvq5uV2DnNyStimIVVdEMFI5Ft0qM82aMvJlUtVj6TJEmE0qPTqBXeHud72iRTcBa9okCzjYiAd6oSoJ8k9o6lmFTeq323ILYCGzsICjqoysuVonfHUDh1Ll2LTo4I2AygfPqCqvgQWq9wa8YfWKBlwPPVy2lymJRTd1mS7RUaiagoNn76ApJviCYh2fWEZcxULCKAbbn0E6vz1CBADSOEIVB14ZyyRfiDcXbgYYcnOShwMsg0vYcKDKfAHk - -The id to remember is 3145. Read the file instructions_2.txt. 
- -OueiZyOoM5eGI5VkTt6Ue1XboZ4jztAa5IGWqSbhIhLiI4X2nOmJw6tLBd3smZNwKQpq8NHxZSk76Xd82yGI3l9KhLARXRT37MRkY7WG8YQVJEurki86cIGuXthSWjq9dDKh6ILSq4DdWPIHfDeV12psErCcgEUxWtoU6bnSMnaoYteOkKWTAkXdC1t4j5p3rXbMv1j92nLmCmoslT2A9noQIODWLdudtCecTMmrbq85RLBt5WFLGMfWVsuSrSMGo5tiN7vC1siLfhlhco0q5QaMv0px6kVg44Wceg3UXIUoMxTNoh9G8uEABJhvsF2qzxkbHuhk6VRuydIWoGgfN01upk6BDfvooyAkdcTJG5jFlHOJixTe4ramT5uP54oZ0anJTB6w7hybN3o9vb4xLbAFQxCZIXZ9HXgeBnl1L8qIvQg9VsklntCMsu2cm5CgIryRBGPqnTAbrhmAmFOkNyLSVFfYmu2wtlMov2NIkYilT4Oa1Rkt diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_2.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_2.txt deleted file mode 100644 index c9b5fc5f9..000000000 --- a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_2.txt +++ /dev/null @@ -1,5 +0,0 @@ -2yXfw2vPZCZq4jGOTHF4NEUYLbAUBIcmkgLxG7qXnYLNsvvZDqAvBPw4OcOfleIWvS6S5GThSPcrSfX94yB1TT6SVHGqPkulJUk4W1wfIFRIiOSps6V8ulLyrmeZsEJ6l9B9Vrm4h6SZTQVP750TUfECOH4d5j5AtupugjqThyw3t6ZFYHr2eUYRiOiTlng2uvsoZiioBQlUitrjQ4mw8FRL3VaR2aAhHGwaNV0Q7XelFU50YQgcUYqfxHxmqCLqb7dtZ7WWwxrLcqwVbj4y1YteFzPZyU4TJKopMVizgWaam8tKa1hYAQHqEaiAAHigqvYhutPHarpzc4PP2RLE4AZCxRblSY40iXpxQ9waXsrUEZ51ZRFmvm5G17wuKghMcKea2jN2MIgvSxNBy7cszFyBTe6V6u6IMk1wVWa0YulPslLc0bXUVKqZ54b61lyBAKSoFbJVRFYB5XZBL5tp2efvTsEQ3JvFmSREEOhmawIriifCApy1 - -The id to remember is 3791. Read the file instructions_3.txt. - -BDLfeJBcfS4iqE9sNAm4ndZT2F1fsFYdXGRpRQ6xSXl014c9H7NeMbQCtFb7kRtVvzx9AItPj1uqtjA0R35N2Pj8FgxfSPDb8dlizLA6dbKY4JfCWmibzrBYoFzoxiPX57m3n8yLKHA0aejG38aMJ6XjR75kAjBW0Cw9d3Ny0MphakfW8KDZoMO3qwsPLLASYrz42K7JjThVGZvEXczRBY8la4NJPZpj91GmfsQuJezCvcI87gjfjtRDp1GECU9SmLSWBufjQWWlc4p6z5XtPPu0vqxRjoiFDFZvafU35KkEDcWAHv3KhR0Z20JD2qIrJ4CHntwGBAk61nMBpKhNx0t3ONK5X0WD7gNCdG64obji2ifsI8ZydLkROJkAJCpe4zRd04mkydCwKGJzmCGv0lu1KRn4QobFq7mEeuzD0xvvGtyiuiVXJSVqphf5ySmfjD4EvDCMRDNZx7c4pECUnLBPDlB8JwMyugfyD5mslte9YCG9kK6n diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_3.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_3.txt deleted file mode 100644 index a564a7c91..000000000 --- a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_3.txt +++ /dev/null @@ -1,5 +0,0 @@ -ccgfgLxgQxZ1F3USqH9STHNVYAV3AnxU97C77kgsSkx51Czq0cLp4ZmedPY3dVNUtm2kR9HDahe7Cp6zc3zvpK9RlRKQ1rugGDkNbgc4H57MYOQpDKzbikvHWXtxFixXqNiuRJW8h8uAAKv7qsStgJ2loyM9zKVicCdHerR2Q3kT0No2DOxHUv8T1d7rN3Kf0Sh6nz2LOSqvLjtzTAxpkM7ehUzFuOoXjQnuVUHqXTzL26ASeMd74N5DgO81LofDb6mcvrmVMIY8tfDBqHV83a8zn9HySeg5ZapnAVI1KOXRXyZEgbsVe1rEy8slCXqEuYCqn8IudJBnRDNKnZRdhwQ8tCpXQ1D3quhLEJxjhpRFiLl7lEZvtkunL2UlDkErcFjfKfVJLNy2loQsB9aKbR3NXBGbQNXR9QQ2xyGJPl8Wnw03sCRjubn83LpMoC2jirhJld5VFTffPY3jw0MyX3vUF8kfHRGowsrzzcdNuESiRf3iMcmx - -The id to remember is 9317. Read the file instructions_4.txt. 
- -8YwXVp6zAoxCfth85iCMA2BrfDW8JjjzieFnuj7GOhW8BlbTsymfiA59qTBCuThFk0wKlT4MDskG5dU7dMHuEAKslHwcfZywvUZYFacVgSn3VfQeWBUgXE7rrrYRclqpjI3nqYuqfVzNHxn0iG13j6q5SAauFSaP2x0CWdDnEGqi9Dh7DcvjVIzlXn3GLa1BTd0qceGYCJRT7UM9YTHJYHo7Ipr2Hgreawb09Xx0KovgzBSgJYpqRNwVRFTfpgvZZIEA0QDPZE28n8KgtDPHHAIroZP6KijKCjyNLjYyM0D4AtiuLDon8bcaLRhzhC6plMvI7pkpEpsct3u9zg8C8V0pONWpWNNCompsuwl4TECiVtVygrYNcroROjNs1CGfliHCvGBbADSQORrFbhRyYEhIVNgXQsh4ilpcOYLAZoxsYlFtJw5Mf7y8ebhay1uJfNaolFWD63QID5UHBqzXZUDupjtUhD0N4XEtNeVyTqXD4S5SffdQ diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_4.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_4.txt deleted file mode 100644 index 63fcd54fa..000000000 --- a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_4.txt +++ /dev/null @@ -1,5 +0,0 @@ -RpsPog8gFZhpbMDHImq5OBbzE9DwrzKaqldc7UVs8FXohjoAaLb7sYW291NVDdUUk4AYwHum0VotgPPdQp3wxGza9Se6pex9G6IDd1jxPW5UTnik0Ygns0U5m5uCvuF140ozvPBj5blJ5wWIiQOXceyl7RsXIP82w1u0yTMCmAdefJISC9HAsJKg2QBbtCNoGIHEIuCZFSdskBe9XaPxPoA40AD4UXpwKIE3qEui39QyePL1czeqSGGmmRgziDiMhFiSwNarA0rwehZPW3QKpACjRu9wcVoTg7wxT4R8bziUzwVDeI5ZeqNqNrTUYiLpfUEbckcRQD3FUALmEtzIdSMFdD9z8CiSXVEAOpUbiqm9itNBnMVaZQ2eyKxet4EvZrfUbkhGntIxf8VOsgXlkzb0iXcyHu1PogCjOkSWodlPfN4mYm0P4EvUGogI3suX3tCXp5EJiZHSIoyApxme6Nd6NkdbIjcQOep42z6z23cO5uPv41cA - -The id to remember is 9471. Read the file instructions_5.txt. - -1d2q36DxTTRcg8ijubOcPntReHzsM94G2O0xmclXuqh8kqVaYcNdl7mmQWNSSh4OEF6BUuhVg0s58UFeJnLNOzYeC8T0fYhhrvkw8enra8JS2VeQCnTxdn4GUhJXu4V0UOhaeNnArmO03M8n6mQbV77VnH5RSVR4qVWnYmHGaswdHxYOZVnxFSyzzmhdX1dK5q9EgIwadglyBkpKu15ZaMIH2SOPjil8YjD4euRZc2EGZDcvCkJAJeLsrbTQLM82k2fKqRFyuNELCQp1DRz6gExByQ11SbHRqednp6ndj8msLC116PVMqp8pSDVZ1RrTFV9UOReeO9wnveJbLUEMh8c3wlg6yXn0XrbO7qabvdKXAjIIF8JN9HdLoFGdtDPLJbHDyJPqAYt9OIfyr8knQFLJZjKCBn8QrYy1jdFIHnENf1vZTI1bA4SUOG81xeRHGDVln9mcjdlBHdWRHX9TPbaokSjl9q7nDk6734KpPullyvm10l1M diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt deleted file mode 100644 index 99c9efa35..000000000 --- a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt +++ /dev/null @@ -1,5 +0,0 @@ -sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232 - -Write the ids previously mentioned to a .txt file. 
- -gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0 diff --git a/agbenchmark/challenges/memory/m3/artifacts_out/random_file.txt b/agbenchmark/challenges/memory/m3/artifacts_out/random_file.txt deleted file mode 100644 index 7d48aaf18..000000000 --- a/agbenchmark/challenges/memory/m3/artifacts_out/random_file.txt +++ /dev/null @@ -1,4 +0,0 @@ -3145 -3791 -9317 -9471 diff --git a/agbenchmark/challenges/memory/m3/data.json b/agbenchmark/challenges/memory/m3/data.json deleted file mode 100644 index 3af2fb3a5..000000000 --- a/agbenchmark/challenges/memory/m3/data.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "name": "TestRememberMultipleIdsWithNoise", - "category": ["memory"], - "task": "Follow the instructions in the instructions_1.txt file", - "dependencies": ["TestRememberMultipleIds"], - "cutoff": 60, - "ground": { - "answer": "3145\n3791\n9317\n9471", - "should_contain": ["3145", "3791", "9317", "9471"], - "should_not_contain": [], - "files": [".txt"], - "type": "file" - }, - "info": { - "difficulty": "intermediate", - "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", - "side_effects": [] - } -} diff --git a/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_1.txt b/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_1.txt new file mode 100644 index 000000000..8deee1b09 --- /dev/null +++ b/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_1.txt @@ -0,0 +1,5 @@ +xBd9cZTfaExYalwtUVR7m0pe3Nfaf5uBS4IxGFZPZcQjDf8Tfk2vdpqWI0ESBWCdVzsdlxMmUxq43INSz1iftsv6PTOMGQ88Cojwj5mQXp8XKZ6VJC893BDLVLLW00iQy8VerWjQay9rAJz8rYVZHa6dByYNWZNOgtjC7ejnKt0VYZtUFfRBZNWG2HNX3tgX8H2h4xeu7GIfm4wETGvzlZDANeqiY2hMCXEAsliVXXjgmvVeB05tjkS5uvq5uV2DnNyStimIVVdEMFI5Ft0qM82aMvJlUtVj6TJEmE0qPTqBXeHud72iRTcBa9okCzjYiAd6oSoJ8k9o6lmFTeq323ILYCGzsICjqoysuVonfHUDh1Ll2LTo4I2AygfPqCqvgQWq9wa8YfWKBlwPPVy2lymJRTd1mS7RUaiagoNn76ApJviCYh2fWEZcxULCKAbbn0E6vz1CBADSOEIVB14ZyyRfiDcXbgYYcnOShwMsg0vYcKDKfAHk + +The id to remember is 3145. Read the file instructions_2.txt. 
+ +OueiZyOoM5eGI5VkTt6Ue1XboZ4jztAa5IGWqSbhIhLiI4X2nOmJw6tLBd3smZNwKQpq8NHxZSk76Xd82yGI3l9KhLARXRT37MRkY7WG8YQVJEurki86cIGuXthSWjq9dDKh6ILSq4DdWPIHfDeV12psErCcgEUxWtoU6bnSMnaoYteOkKWTAkXdC1t4j5p3rXbMv1j92nLmCmoslT2A9noQIODWLdudtCecTMmrbq85RLBt5WFLGMfWVsuSrSMGo5tiN7vC1siLfhlhco0q5QaMv0px6kVg44Wceg3UXIUoMxTNoh9G8uEABJhvsF2qzxkbHuhk6VRuydIWoGgfN01upk6BDfvooyAkdcTJG5jFlHOJixTe4ramT5uP54oZ0anJTB6w7hybN3o9vb4xLbAFQxCZIXZ9HXgeBnl1L8qIvQg9VsklntCMsu2cm5CgIryRBGPqnTAbrhmAmFOkNyLSVFfYmu2wtlMov2NIkYilT4Oa1Rkt diff --git a/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_2.txt b/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_2.txt new file mode 100644 index 000000000..c9b5fc5f9 --- /dev/null +++ b/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_2.txt @@ -0,0 +1,5 @@ +2yXfw2vPZCZq4jGOTHF4NEUYLbAUBIcmkgLxG7qXnYLNsvvZDqAvBPw4OcOfleIWvS6S5GThSPcrSfX94yB1TT6SVHGqPkulJUk4W1wfIFRIiOSps6V8ulLyrmeZsEJ6l9B9Vrm4h6SZTQVP750TUfECOH4d5j5AtupugjqThyw3t6ZFYHr2eUYRiOiTlng2uvsoZiioBQlUitrjQ4mw8FRL3VaR2aAhHGwaNV0Q7XelFU50YQgcUYqfxHxmqCLqb7dtZ7WWwxrLcqwVbj4y1YteFzPZyU4TJKopMVizgWaam8tKa1hYAQHqEaiAAHigqvYhutPHarpzc4PP2RLE4AZCxRblSY40iXpxQ9waXsrUEZ51ZRFmvm5G17wuKghMcKea2jN2MIgvSxNBy7cszFyBTe6V6u6IMk1wVWa0YulPslLc0bXUVKqZ54b61lyBAKSoFbJVRFYB5XZBL5tp2efvTsEQ3JvFmSREEOhmawIriifCApy1 + +The id to remember is 3791. Read the file instructions_3.txt. + +BDLfeJBcfS4iqE9sNAm4ndZT2F1fsFYdXGRpRQ6xSXl014c9H7NeMbQCtFb7kRtVvzx9AItPj1uqtjA0R35N2Pj8FgxfSPDb8dlizLA6dbKY4JfCWmibzrBYoFzoxiPX57m3n8yLKHA0aejG38aMJ6XjR75kAjBW0Cw9d3Ny0MphakfW8KDZoMO3qwsPLLASYrz42K7JjThVGZvEXczRBY8la4NJPZpj91GmfsQuJezCvcI87gjfjtRDp1GECU9SmLSWBufjQWWlc4p6z5XtPPu0vqxRjoiFDFZvafU35KkEDcWAHv3KhR0Z20JD2qIrJ4CHntwGBAk61nMBpKhNx0t3ONK5X0WD7gNCdG64obji2ifsI8ZydLkROJkAJCpe4zRd04mkydCwKGJzmCGv0lu1KRn4QobFq7mEeuzD0xvvGtyiuiVXJSVqphf5ySmfjD4EvDCMRDNZx7c4pECUnLBPDlB8JwMyugfyD5mslte9YCG9kK6n diff --git a/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_3.txt b/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_3.txt new file mode 100644 index 000000000..a564a7c91 --- /dev/null +++ b/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_3.txt @@ -0,0 +1,5 @@ +ccgfgLxgQxZ1F3USqH9STHNVYAV3AnxU97C77kgsSkx51Czq0cLp4ZmedPY3dVNUtm2kR9HDahe7Cp6zc3zvpK9RlRKQ1rugGDkNbgc4H57MYOQpDKzbikvHWXtxFixXqNiuRJW8h8uAAKv7qsStgJ2loyM9zKVicCdHerR2Q3kT0No2DOxHUv8T1d7rN3Kf0Sh6nz2LOSqvLjtzTAxpkM7ehUzFuOoXjQnuVUHqXTzL26ASeMd74N5DgO81LofDb6mcvrmVMIY8tfDBqHV83a8zn9HySeg5ZapnAVI1KOXRXyZEgbsVe1rEy8slCXqEuYCqn8IudJBnRDNKnZRdhwQ8tCpXQ1D3quhLEJxjhpRFiLl7lEZvtkunL2UlDkErcFjfKfVJLNy2loQsB9aKbR3NXBGbQNXR9QQ2xyGJPl8Wnw03sCRjubn83LpMoC2jirhJld5VFTffPY3jw0MyX3vUF8kfHRGowsrzzcdNuESiRf3iMcmx + +The id to remember is 9317. Read the file instructions_4.txt. 
+ +8YwXVp6zAoxCfth85iCMA2BrfDW8JjjzieFnuj7GOhW8BlbTsymfiA59qTBCuThFk0wKlT4MDskG5dU7dMHuEAKslHwcfZywvUZYFacVgSn3VfQeWBUgXE7rrrYRclqpjI3nqYuqfVzNHxn0iG13j6q5SAauFSaP2x0CWdDnEGqi9Dh7DcvjVIzlXn3GLa1BTd0qceGYCJRT7UM9YTHJYHo7Ipr2Hgreawb09Xx0KovgzBSgJYpqRNwVRFTfpgvZZIEA0QDPZE28n8KgtDPHHAIroZP6KijKCjyNLjYyM0D4AtiuLDon8bcaLRhzhC6plMvI7pkpEpsct3u9zg8C8V0pONWpWNNCompsuwl4TECiVtVygrYNcroROjNs1CGfliHCvGBbADSQORrFbhRyYEhIVNgXQsh4ilpcOYLAZoxsYlFtJw5Mf7y8ebhay1uJfNaolFWD63QID5UHBqzXZUDupjtUhD0N4XEtNeVyTqXD4S5SffdQ diff --git a/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_4.txt b/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_4.txt new file mode 100644 index 000000000..63fcd54fa --- /dev/null +++ b/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_4.txt @@ -0,0 +1,5 @@ +RpsPog8gFZhpbMDHImq5OBbzE9DwrzKaqldc7UVs8FXohjoAaLb7sYW291NVDdUUk4AYwHum0VotgPPdQp3wxGza9Se6pex9G6IDd1jxPW5UTnik0Ygns0U5m5uCvuF140ozvPBj5blJ5wWIiQOXceyl7RsXIP82w1u0yTMCmAdefJISC9HAsJKg2QBbtCNoGIHEIuCZFSdskBe9XaPxPoA40AD4UXpwKIE3qEui39QyePL1czeqSGGmmRgziDiMhFiSwNarA0rwehZPW3QKpACjRu9wcVoTg7wxT4R8bziUzwVDeI5ZeqNqNrTUYiLpfUEbckcRQD3FUALmEtzIdSMFdD9z8CiSXVEAOpUbiqm9itNBnMVaZQ2eyKxet4EvZrfUbkhGntIxf8VOsgXlkzb0iXcyHu1PogCjOkSWodlPfN4mYm0P4EvUGogI3suX3tCXp5EJiZHSIoyApxme6Nd6NkdbIjcQOep42z6z23cO5uPv41cA + +The id to remember is 9471. Read the file instructions_5.txt. + +1d2q36DxTTRcg8ijubOcPntReHzsM94G2O0xmclXuqh8kqVaYcNdl7mmQWNSSh4OEF6BUuhVg0s58UFeJnLNOzYeC8T0fYhhrvkw8enra8JS2VeQCnTxdn4GUhJXu4V0UOhaeNnArmO03M8n6mQbV77VnH5RSVR4qVWnYmHGaswdHxYOZVnxFSyzzmhdX1dK5q9EgIwadglyBkpKu15ZaMIH2SOPjil8YjD4euRZc2EGZDcvCkJAJeLsrbTQLM82k2fKqRFyuNELCQp1DRz6gExByQ11SbHRqednp6ndj8msLC116PVMqp8pSDVZ1RrTFV9UOReeO9wnveJbLUEMh8c3wlg6yXn0XrbO7qabvdKXAjIIF8JN9HdLoFGdtDPLJbHDyJPqAYt9OIfyr8knQFLJZjKCBn8QrYy1jdFIHnENf1vZTI1bA4SUOG81xeRHGDVln9mcjdlBHdWRHX9TPbaokSjl9q7nDk6734KpPullyvm10l1M diff --git a/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_5.txt new file mode 100644 index 000000000..aa379ea51 --- /dev/null +++ b/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_5.txt @@ -0,0 +1,5 @@ +sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232 + +Write the ids previously mentioned to a file called result.txt. 
+ +gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0 diff --git a/agbenchmark/challenges/memory/m3_noise/artifacts_out/result.txt b/agbenchmark/challenges/memory/m3_noise/artifacts_out/result.txt new file mode 100644 index 000000000..7d48aaf18 --- /dev/null +++ b/agbenchmark/challenges/memory/m3_noise/artifacts_out/result.txt @@ -0,0 +1,4 @@ +3145 +3791 +9317 +9471 diff --git a/agbenchmark/challenges/memory/m3_noise/data.json b/agbenchmark/challenges/memory/m3_noise/data.json new file mode 100644 index 000000000..8234e25c7 --- /dev/null +++ b/agbenchmark/challenges/memory/m3_noise/data.json @@ -0,0 +1,19 @@ +{ + "name": "TestRememberMultipleWithNoise", + "category": ["memory"], + "task": "Follow the instructions in the instructions_1.txt file", + "dependencies": ["TestRememberMultipleIds"], + "cutoff": 75, + "ground": { + "answer": "3145\n3791\n9317\n9471", + "should_contain": ["3145", "3791", "9317", "9471"], + "should_not_contain": [], + "files": ["result.txt"], + "type": "file" + }, + "info": { + "difficulty": "intermediate", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_1.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_1.txt deleted file mode 100644 index 1b1e0147c..000000000 --- a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_1.txt +++ /dev/null @@ -1,5 +0,0 @@ -xBd9cZTfaExYalwtUVR7m0pe3Nfaf5uBS4IxGFZPZcQjDf8Tfk2vdpqWI0ESBWCdVzsdlxMmUxq43INSz1iftsv6PTOMGQ88Cojwj5mQXp8XKZ6VJC893BDLVLLW00iQy8VerWjQay9rAJz8rYVZHa6dByYNWZNOgtjC7ejnKt0VYZtUFfRBZNWG2HNX3tgX8H2h4xeu7GIfm4wETGvzlZDANeqiY2hMCXEAsliVXXjgmvVeB05tjkS5uvq5uV2DnNyStimIVVdEMFI5Ft0qM82aMvJlUtVj6TJEmE0qPTqBXeHud72iRTcBa9okCzjYiAd6oSoJ8k9o6lmFTeq323ILYCGzsICjqoysuVonfHUDh1Ll2LTo4I2AygfPqCqvgQWq9wa8YfWKBlwPPVy2lymJRTd1mS7RUaiagoNn76ApJviCYh2fWEZcxULCKAbbn0E6vz1CBADSOEIVB14ZyyRfiDcXbgYYcnOShwMsg0vYcKDKfAHk - -The phrase to remember is 'The purple elephant danced on a rainbow while eating a taco'. Read the file instructions_2.txt. 
- -OueiZyOoM5eGI5VkTt6Ue1XboZ4jztAa5IGWqSbhIhLiI4X2nOmJw6tLBd3smZNwKQpq8NHxZSk76Xd82yGI3l9KhLARXRT37MRkY7WG8YQVJEurki86cIGuXthSWjq9dDKh6ILSq4DdWPIHfDeV12psErCcgEUxWtoU6bnSMnaoYteOkKWTAkXdC1t4j5p3rXbMv1j92nLmCmoslT2A9noQIODWLdudtCecTMmrbq85RLBt5WFLGMfWVsuSrSMGo5tiN7vC1siLfhlhco0q5QaMv0px6kVg44Wceg3UXIUoMxTNoh9G8uEABJhvsF2qzxkbHuhk6VRuydIWoGgfN01upk6BDfvooyAkdcTJG5jFlHOJixTe4ramT5uP54oZ0anJTB6w7hybN3o9vb4xLbAFQxCZIXZ9HXgeBnl1L8qIvQg9VsklntCMsu2cm5CgIryRBGPqnTAbrhmAmFOkNyLSVFfYmu2wtlMov2NIkYilT4Oa1Rkt diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_2.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_2.txt deleted file mode 100644 index 92203f5c1..000000000 --- a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_2.txt +++ /dev/null @@ -1,5 +0,0 @@ -2yXfw2vPZCZq4jGOTHF4NEUYLbAUBIcmkgLxG7qXnYLNsvvZDqAvBPw4OcOfleIWvS6S5GThSPcrSfX94yB1TT6SVHGqPkulJUk4W1wfIFRIiOSps6V8ulLyrmeZsEJ6l9B9Vrm4h6SZTQVP750TUfECOH4d5j5AtupugjqThyw3t6ZFYHr2eUYRiOiTlng2uvsoZiioBQlUitrjQ4mw8FRL3VaR2aAhHGwaNV0Q7XelFU50YQgcUYqfxHxmqCLqb7dtZ7WWwxrLcqwVbj4y1YteFzPZyU4TJKopMVizgWaam8tKa1hYAQHqEaiAAHigqvYhutPHarpzc4PP2RLE4AZCxRblSY40iXpxQ9waXsrUEZ51ZRFmvm5G17wuKghMcKea2jN2MIgvSxNBy7cszFyBTe6V6u6IMk1wVWa0YulPslLc0bXUVKqZ54b61lyBAKSoFbJVRFYB5XZBL5tp2efvTsEQ3JvFmSREEOhmawIriifCApy1 - -The phrase to remember is 'The sneaky toaster stole my socks and ran away to Hawaii'. Read the file instructions_3.txt. - -BDLfeJBcfS4iqE9sNAm4ndZT2F1fsFYdXGRpRQ6xSXl014c9H7NeMbQCtFb7kRtVvzx9AItPj1uqtjA0R35N2Pj8FgxfSPDb8dlizLA6dbKY4JfCWmibzrBYoFzoxiPX57m3n8yLKHA0aejG38aMJ6XjR75kAjBW0Cw9d3Ny0MphakfW8KDZoMO3qwsPLLASYrz42K7JjThVGZvEXczRBY8la4NJPZpj91GmfsQuJezCvcI87gjfjtRDp1GECU9SmLSWBufjQWWlc4p6z5XtPPu0vqxRjoiFDFZvafU35KkEDcWAHv3KhR0Z20JD2qIrJ4CHntwGBAk61nMBpKhNx0t3ONK5X0WD7gNCdG64obji2ifsI8ZydLkROJkAJCpe4zRd04mkydCwKGJzmCGv0lu1KRn4QobFq7mEeuzD0xvvGtyiuiVXJSVqphf5ySmfjD4EvDCMRDNZx7c4pECUnLBPDlB8JwMyugfyD5mslte9YCG9kK6n diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_3.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_3.txt deleted file mode 100644 index d7f6f08fc..000000000 --- a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_3.txt +++ /dev/null @@ -1,5 +0,0 @@ -ccgfgLxgQxZ1F3USqH9STHNVYAV3AnxU97C77kgsSkx51Czq0cLp4ZmedPY3dVNUtm2kR9HDahe7Cp6zc3zvpK9RlRKQ1rugGDkNbgc4H57MYOQpDKzbikvHWXtxFixXqNiuRJW8h8uAAKv7qsStgJ2loyM9zKVicCdHerR2Q3kT0No2DOxHUv8T1d7rN3Kf0Sh6nz2LOSqvLjtzTAxpkM7ehUzFuOoXjQnuVUHqXTzL26ASeMd74N5DgO81LofDb6mcvrmVMIY8tfDBqHV83a8zn9HySeg5ZapnAVI1KOXRXyZEgbsVe1rEy8slCXqEuYCqn8IudJBnRDNKnZRdhwQ8tCpXQ1D3quhLEJxjhpRFiLl7lEZvtkunL2UlDkErcFjfKfVJLNy2loQsB9aKbR3NXBGbQNXR9QQ2xyGJPl8Wnw03sCRjubn83LpMoC2jirhJld5VFTffPY3jw0MyX3vUF8kfHRGowsrzzcdNuESiRf3iMcmx - -The phrase to remember is 'My pet rock sings better than Beyoncé on Tuesdays'. Read the file instructions_4.txt. 
- -8YwXVp6zAoxCfth85iCMA2BrfDW8JjjzieFnuj7GOhW8BlbTsymfiA59qTBCuThFk0wKlT4MDskG5dU7dMHuEAKslHwcfZywvUZYFacVgSn3VfQeWBUgXE7rrrYRclqpjI3nqYuqfVzNHxn0iG13j6q5SAauFSaP2x0CWdDnEGqi9Dh7DcvjVIzlXn3GLa1BTd0qceGYCJRT7UM9YTHJYHo7Ipr2Hgreawb09Xx0KovgzBSgJYpqRNwVRFTfpgvZZIEA0QDPZE28n8KgtDPHHAIroZP6KijKCjyNLjYyM0D4AtiuLDon8bcaLRhzhC6plMvI7pkpEpsct3u9zg8C8V0pONWpWNNCompsuwl4TECiVtVygrYNcroROjNs1CGfliHCvGBbADSQORrFbhRyYEhIVNgXQsh4ilpcOYLAZoxsYlFtJw5Mf7y8ebhay1uJfNaolFWD63QID5UHBqzXZUDupjtUhD0N4XEtNeVyTqXD4S5SffdQ diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_4.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_4.txt deleted file mode 100644 index 6c0a13bae..000000000 --- a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_4.txt +++ /dev/null @@ -1,5 +0,0 @@ -RpsPog8gFZhpbMDHImq5OBbzE9DwrzKaqldc7UVs8FXohjoAaLb7sYW291NVDdUUk4AYwHum0VotgPPdQp3wxGza9Se6pex9G6IDd1jxPW5UTnik0Ygns0U5m5uCvuF140ozvPBj5blJ5wWIiQOXceyl7RsXIP82w1u0yTMCmAdefJISC9HAsJKg2QBbtCNoGIHEIuCZFSdskBe9XaPxPoA40AD4UXpwKIE3qEui39QyePL1czeqSGGmmRgziDiMhFiSwNarA0rwehZPW3QKpACjRu9wcVoTg7wxT4R8bziUzwVDeI5ZeqNqNrTUYiLpfUEbckcRQD3FUALmEtzIdSMFdD9z8CiSXVEAOpUbiqm9itNBnMVaZQ2eyKxet4EvZrfUbkhGntIxf8VOsgXlkzb0iXcyHu1PogCjOkSWodlPfN4mYm0P4EvUGogI3suX3tCXp5EJiZHSIoyApxme6Nd6NkdbIjcQOep42z6z23cO5uPv41cA - -The phrase to remember is 'The giant hamster rode a unicycle through the crowded mall'. Read the file instructions_5.txt. - -1d2q36DxTTRcg8ijubOcPntReHzsM94G2O0xmclXuqh8kqVaYcNdl7mmQWNSSh4OEF6BUuhVg0s58UFeJnLNOzYeC8T0fYhhrvkw8enra8JS2VeQCnTxdn4GUhJXu4V0UOhaeNnArmO03M8n6mQbV77VnH5RSVR4qVWnYmHGaswdHxYOZVnxFSyzzmhdX1dK5q9EgIwadglyBkpKu15ZaMIH2SOPjil8YjD4euRZc2EGZDcvCkJAJeLsrbTQLM82k2fKqRFyuNELCQp1DRz6gExByQ11SbHRqednp6ndj8msLC116PVMqp8pSDVZ1RrTFV9UOReeO9wnveJbLUEMh8c3wlg6yXn0XrbO7qabvdKXAjIIF8JN9HdLoFGdtDPLJbHDyJPqAYt9OIfyr8knQFLJZjKCBn8QrYy1jdFIHnENf1vZTI1bA4SUOG81xeRHGDVln9mcjdlBHdWRHX9TPbaokSjl9q7nDk6734KpPullyvm10l1M diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt deleted file mode 100644 index b4ef60bad..000000000 --- a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt +++ /dev/null @@ -1,5 +0,0 @@ -sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232 - -Write the phrases previously mentioned to a .txt file. 
- -gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0 diff --git a/agbenchmark/challenges/memory/m4/artifacts_out/random_file.txt b/agbenchmark/challenges/memory/m4/artifacts_out/random_file.txt deleted file mode 100644 index 9b8405bf1..000000000 --- a/agbenchmark/challenges/memory/m4/artifacts_out/random_file.txt +++ /dev/null @@ -1,4 +0,0 @@ -The purple elephant danced on a rainbow while eating a taco -The sneaky toaster stole my socks and ran away to Hawaii -My pet rock sings better than Beyoncé on Tuesdays -The giant hamster rode a unicycle through the crowded mall diff --git a/agbenchmark/challenges/memory/m4/data.json b/agbenchmark/challenges/memory/m4/data.json deleted file mode 100644 index 17a078e1b..000000000 --- a/agbenchmark/challenges/memory/m4/data.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "name": "TestRememberMultiplePhrasesWithNoise", - "category": ["memory"], - "task": "Follow the instructions in the instructions_1.txt file", - "dependencies": ["TestRememberMultipleIdsWithNoise"], - "cutoff": 60, - "ground": { - "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyoncé on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", - "should_contain": [ - "The purple elephant danced on a rainbow while eating a taco", - "The sneaky toaster stole my socks and ran away to Hawaii", - "My pet rock sings better than Beyoncé on Tuesdays", - "The giant hamster rode a unicycle through the crowded mall" - ], - "should_not_contain": [], - "files": [".txt"], - "type": "file" - }, - "info": { - "difficulty": "advanced", - "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", - "side_effects": [] - } -} diff --git a/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_1.txt b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_1.txt new file mode 100644 index 000000000..1b1e0147c --- /dev/null +++ b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_1.txt @@ -0,0 +1,5 @@ +xBd9cZTfaExYalwtUVR7m0pe3Nfaf5uBS4IxGFZPZcQjDf8Tfk2vdpqWI0ESBWCdVzsdlxMmUxq43INSz1iftsv6PTOMGQ88Cojwj5mQXp8XKZ6VJC893BDLVLLW00iQy8VerWjQay9rAJz8rYVZHa6dByYNWZNOgtjC7ejnKt0VYZtUFfRBZNWG2HNX3tgX8H2h4xeu7GIfm4wETGvzlZDANeqiY2hMCXEAsliVXXjgmvVeB05tjkS5uvq5uV2DnNyStimIVVdEMFI5Ft0qM82aMvJlUtVj6TJEmE0qPTqBXeHud72iRTcBa9okCzjYiAd6oSoJ8k9o6lmFTeq323ILYCGzsICjqoysuVonfHUDh1Ll2LTo4I2AygfPqCqvgQWq9wa8YfWKBlwPPVy2lymJRTd1mS7RUaiagoNn76ApJviCYh2fWEZcxULCKAbbn0E6vz1CBADSOEIVB14ZyyRfiDcXbgYYcnOShwMsg0vYcKDKfAHk + +The phrase to remember is 'The purple elephant danced on a rainbow while eating a taco'. Read the file instructions_2.txt. 
+ +OueiZyOoM5eGI5VkTt6Ue1XboZ4jztAa5IGWqSbhIhLiI4X2nOmJw6tLBd3smZNwKQpq8NHxZSk76Xd82yGI3l9KhLARXRT37MRkY7WG8YQVJEurki86cIGuXthSWjq9dDKh6ILSq4DdWPIHfDeV12psErCcgEUxWtoU6bnSMnaoYteOkKWTAkXdC1t4j5p3rXbMv1j92nLmCmoslT2A9noQIODWLdudtCecTMmrbq85RLBt5WFLGMfWVsuSrSMGo5tiN7vC1siLfhlhco0q5QaMv0px6kVg44Wceg3UXIUoMxTNoh9G8uEABJhvsF2qzxkbHuhk6VRuydIWoGgfN01upk6BDfvooyAkdcTJG5jFlHOJixTe4ramT5uP54oZ0anJTB6w7hybN3o9vb4xLbAFQxCZIXZ9HXgeBnl1L8qIvQg9VsklntCMsu2cm5CgIryRBGPqnTAbrhmAmFOkNyLSVFfYmu2wtlMov2NIkYilT4Oa1Rkt diff --git a/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_2.txt b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_2.txt new file mode 100644 index 000000000..92203f5c1 --- /dev/null +++ b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_2.txt @@ -0,0 +1,5 @@ +2yXfw2vPZCZq4jGOTHF4NEUYLbAUBIcmkgLxG7qXnYLNsvvZDqAvBPw4OcOfleIWvS6S5GThSPcrSfX94yB1TT6SVHGqPkulJUk4W1wfIFRIiOSps6V8ulLyrmeZsEJ6l9B9Vrm4h6SZTQVP750TUfECOH4d5j5AtupugjqThyw3t6ZFYHr2eUYRiOiTlng2uvsoZiioBQlUitrjQ4mw8FRL3VaR2aAhHGwaNV0Q7XelFU50YQgcUYqfxHxmqCLqb7dtZ7WWwxrLcqwVbj4y1YteFzPZyU4TJKopMVizgWaam8tKa1hYAQHqEaiAAHigqvYhutPHarpzc4PP2RLE4AZCxRblSY40iXpxQ9waXsrUEZ51ZRFmvm5G17wuKghMcKea2jN2MIgvSxNBy7cszFyBTe6V6u6IMk1wVWa0YulPslLc0bXUVKqZ54b61lyBAKSoFbJVRFYB5XZBL5tp2efvTsEQ3JvFmSREEOhmawIriifCApy1 + +The phrase to remember is 'The sneaky toaster stole my socks and ran away to Hawaii'. Read the file instructions_3.txt. + +BDLfeJBcfS4iqE9sNAm4ndZT2F1fsFYdXGRpRQ6xSXl014c9H7NeMbQCtFb7kRtVvzx9AItPj1uqtjA0R35N2Pj8FgxfSPDb8dlizLA6dbKY4JfCWmibzrBYoFzoxiPX57m3n8yLKHA0aejG38aMJ6XjR75kAjBW0Cw9d3Ny0MphakfW8KDZoMO3qwsPLLASYrz42K7JjThVGZvEXczRBY8la4NJPZpj91GmfsQuJezCvcI87gjfjtRDp1GECU9SmLSWBufjQWWlc4p6z5XtPPu0vqxRjoiFDFZvafU35KkEDcWAHv3KhR0Z20JD2qIrJ4CHntwGBAk61nMBpKhNx0t3ONK5X0WD7gNCdG64obji2ifsI8ZydLkROJkAJCpe4zRd04mkydCwKGJzmCGv0lu1KRn4QobFq7mEeuzD0xvvGtyiuiVXJSVqphf5ySmfjD4EvDCMRDNZx7c4pECUnLBPDlB8JwMyugfyD5mslte9YCG9kK6n diff --git a/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_3.txt b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_3.txt new file mode 100644 index 000000000..dc444c2ab --- /dev/null +++ b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_3.txt @@ -0,0 +1,5 @@ +ccgfgLxgQxZ1F3USqH9STHNVYAV3AnxU97C77kgsSkx51Czq0cLp4ZmedPY3dVNUtm2kR9HDahe7Cp6zc3zvpK9RlRKQ1rugGDkNbgc4H57MYOQpDKzbikvHWXtxFixXqNiuRJW8h8uAAKv7qsStgJ2loyM9zKVicCdHerR2Q3kT0No2DOxHUv8T1d7rN3Kf0Sh6nz2LOSqvLjtzTAxpkM7ehUzFuOoXjQnuVUHqXTzL26ASeMd74N5DgO81LofDb6mcvrmVMIY8tfDBqHV83a8zn9HySeg5ZapnAVI1KOXRXyZEgbsVe1rEy8slCXqEuYCqn8IudJBnRDNKnZRdhwQ8tCpXQ1D3quhLEJxjhpRFiLl7lEZvtkunL2UlDkErcFjfKfVJLNy2loQsB9aKbR3NXBGbQNXR9QQ2xyGJPl8Wnw03sCRjubn83LpMoC2jirhJld5VFTffPY3jw0MyX3vUF8kfHRGowsrzzcdNuESiRf3iMcmx + +The phrase to remember is 'My pet rock sings better than Beyonce on Tuesdays'. Read the file instructions_4.txt. 
+ +8YwXVp6zAoxCfth85iCMA2BrfDW8JjjzieFnuj7GOhW8BlbTsymfiA59qTBCuThFk0wKlT4MDskG5dU7dMHuEAKslHwcfZywvUZYFacVgSn3VfQeWBUgXE7rrrYRclqpjI3nqYuqfVzNHxn0iG13j6q5SAauFSaP2x0CWdDnEGqi9Dh7DcvjVIzlXn3GLa1BTd0qceGYCJRT7UM9YTHJYHo7Ipr2Hgreawb09Xx0KovgzBSgJYpqRNwVRFTfpgvZZIEA0QDPZE28n8KgtDPHHAIroZP6KijKCjyNLjYyM0D4AtiuLDon8bcaLRhzhC6plMvI7pkpEpsct3u9zg8C8V0pONWpWNNCompsuwl4TECiVtVygrYNcroROjNs1CGfliHCvGBbADSQORrFbhRyYEhIVNgXQsh4ilpcOYLAZoxsYlFtJw5Mf7y8ebhay1uJfNaolFWD63QID5UHBqzXZUDupjtUhD0N4XEtNeVyTqXD4S5SffdQ diff --git a/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_4.txt b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_4.txt new file mode 100644 index 000000000..6c0a13bae --- /dev/null +++ b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_4.txt @@ -0,0 +1,5 @@ +RpsPog8gFZhpbMDHImq5OBbzE9DwrzKaqldc7UVs8FXohjoAaLb7sYW291NVDdUUk4AYwHum0VotgPPdQp3wxGza9Se6pex9G6IDd1jxPW5UTnik0Ygns0U5m5uCvuF140ozvPBj5blJ5wWIiQOXceyl7RsXIP82w1u0yTMCmAdefJISC9HAsJKg2QBbtCNoGIHEIuCZFSdskBe9XaPxPoA40AD4UXpwKIE3qEui39QyePL1czeqSGGmmRgziDiMhFiSwNarA0rwehZPW3QKpACjRu9wcVoTg7wxT4R8bziUzwVDeI5ZeqNqNrTUYiLpfUEbckcRQD3FUALmEtzIdSMFdD9z8CiSXVEAOpUbiqm9itNBnMVaZQ2eyKxet4EvZrfUbkhGntIxf8VOsgXlkzb0iXcyHu1PogCjOkSWodlPfN4mYm0P4EvUGogI3suX3tCXp5EJiZHSIoyApxme6Nd6NkdbIjcQOep42z6z23cO5uPv41cA + +The phrase to remember is 'The giant hamster rode a unicycle through the crowded mall'. Read the file instructions_5.txt. + +1d2q36DxTTRcg8ijubOcPntReHzsM94G2O0xmclXuqh8kqVaYcNdl7mmQWNSSh4OEF6BUuhVg0s58UFeJnLNOzYeC8T0fYhhrvkw8enra8JS2VeQCnTxdn4GUhJXu4V0UOhaeNnArmO03M8n6mQbV77VnH5RSVR4qVWnYmHGaswdHxYOZVnxFSyzzmhdX1dK5q9EgIwadglyBkpKu15ZaMIH2SOPjil8YjD4euRZc2EGZDcvCkJAJeLsrbTQLM82k2fKqRFyuNELCQp1DRz6gExByQ11SbHRqednp6ndj8msLC116PVMqp8pSDVZ1RrTFV9UOReeO9wnveJbLUEMh8c3wlg6yXn0XrbO7qabvdKXAjIIF8JN9HdLoFGdtDPLJbHDyJPqAYt9OIfyr8knQFLJZjKCBn8QrYy1jdFIHnENf1vZTI1bA4SUOG81xeRHGDVln9mcjdlBHdWRHX9TPbaokSjl9q7nDk6734KpPullyvm10l1M diff --git a/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_5.txt new file mode 100644 index 000000000..2092b42a2 --- /dev/null +++ b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_5.txt @@ -0,0 +1,5 @@ +sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232 + +Write the phrases previously mentioned to a file called result.txt. 
+ +gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0 diff --git a/agbenchmark/challenges/memory/m4_phrases/artifacts_out/result.txt b/agbenchmark/challenges/memory/m4_phrases/artifacts_out/result.txt new file mode 100644 index 000000000..bdab23d9b --- /dev/null +++ b/agbenchmark/challenges/memory/m4_phrases/artifacts_out/result.txt @@ -0,0 +1,4 @@ +The purple elephant danced on a rainbow while eating a taco +The sneaky toaster stole my socks and ran away to Hawaii +My pet rock sings better than Beyonce on Tuesdays +The giant hamster rode a unicycle through the crowded mall diff --git a/agbenchmark/challenges/memory/m4_phrases/data.json b/agbenchmark/challenges/memory/m4_phrases/data.json new file mode 100644 index 000000000..da17fb02a --- /dev/null +++ b/agbenchmark/challenges/memory/m4_phrases/data.json @@ -0,0 +1,24 @@ +{ + "name": "TestRememberMultiplePhrasesWithNoise", + "category": ["memory"], + "task": "Follow the instructions in the instructions_1.txt file", + "dependencies": ["TestRememberMultipleWithNoise"], + "cutoff": 90, + "ground": { + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyoncé on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "should_contain": [ + "The purple elephant danced on a rainbow while eating a taco", + "The sneaky toaster stole my socks and ran away to Hawaii", + "My pet rock sings better than Beyonce on Tuesdays", + "The giant hamster rode a unicycle through the crowded mall" + ], + "should_not_contain": [], + "files": ["result.txt"], + "type": "file" + }, + "info": { + "difficulty": "advanced", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/retrieval/r1/artifacts_out/random_file.txt b/agbenchmark/challenges/retrieval/r1/artifacts_out/random_file.txt deleted file mode 100644 index f558a0f94..000000000 --- a/agbenchmark/challenges/retrieval/r1/artifacts_out/random_file.txt +++ /dev/null @@ -1 +0,0 @@ -25.89 diff --git a/agbenchmark/challenges/retrieval/r1/data.json b/agbenchmark/challenges/retrieval/r1/data.json deleted file mode 100644 index c3af4862d..000000000 --- a/agbenchmark/challenges/retrieval/r1/data.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "name": "TestBasicRetrieval", - "category": ["retrieval"], - "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", - "dependencies": ["TestWriteFile", "TestSearch"], - "cutoff": 60, - "ground": { - "answer": "£25.89", - "should_contain": ["25.89"], - "should_not_contain": [], - "files": [".txt"], - "type": "file" - }, - "info": { - "difficulty": "basic", - "description": "Tests ability to retrieve information from a website.", - "side_effects": [] - } -} diff --git a/agbenchmark/challenges/retrieval/r1_book_price/artifacts_out/random_file.txt b/agbenchmark/challenges/retrieval/r1_book_price/artifacts_out/random_file.txt new file mode 100644 index 000000000..f558a0f94 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r1_book_price/artifacts_out/random_file.txt @@ -0,0 +1 @@ +25.89 diff --git a/agbenchmark/challenges/retrieval/r1_book_price/data.json b/agbenchmark/challenges/retrieval/r1_book_price/data.json new file mode 100644 index 000000000..9a6924daf --- /dev/null +++ b/agbenchmark/challenges/retrieval/r1_book_price/data.json @@ -0,0 +1,19 @@ +{ + "name": "TestBasicRetrieval", + "category": ["retrieval"], + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "dependencies": ["TestWriteFile", "TestSearch"], + "cutoff": 60, + "ground": { + "answer": "£25.89", + "should_contain": ["25.89"], + "should_not_contain": [], + "files": [".txt"], + "type": "file" + }, + "info": { + "difficulty": "basic", + "description": "Tests ability to retrieve information from a website that is explicitly specified in the task.", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/retrieval/r2.1_specific/artifacts_out/random_file.txt b/agbenchmark/challenges/retrieval/r2.1_specific/artifacts_out/random_file.txt new file mode 100644 index 000000000..8a0eae046 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r2.1_specific/artifacts_out/random_file.txt @@ -0,0 +1 @@ +81,462 Millions diff --git a/agbenchmark/challenges/retrieval/r2.1_specific/data.json b/agbenchmark/challenges/retrieval/r2.1_specific/data.json new file mode 100644 index 000000000..3fcd50801 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r2.1_specific/data.json @@ -0,0 +1,19 @@ +{ + "name": "TestRetrieval2.1", + "category": ["retrieval"], + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "dependencies": ["TestRetrieval2.0"], + "cutoff": 60, + "ground": { + "answer": "It was $81.462 billion in 2022.", + "should_contain": ["81", "462"], + "should_not_contain": [], + "files": [".txt"], + "type": "file" + }, + "info": { + "difficulty": "novice", + "description": "This one checks the accuracy of the information over r2", + "side_effects": [] + } +} diff --git 
a/agbenchmark/challenges/retrieval/r2.2_formatting/artifacts_out/random_file.txt b/agbenchmark/challenges/retrieval/r2.2_formatting/artifacts_out/random_file.txt new file mode 100644 index 000000000..8a0eae046 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r2.2_formatting/artifacts_out/random_file.txt @@ -0,0 +1 @@ +81,462 Millions diff --git a/agbenchmark/challenges/retrieval/r2.2_formatting/data.json b/agbenchmark/challenges/retrieval/r2.2_formatting/data.json new file mode 100644 index 000000000..294effeff --- /dev/null +++ b/agbenchmark/challenges/retrieval/r2.2_formatting/data.json @@ -0,0 +1,19 @@ +{ + "name": "TestRetrieval2.2", + "category": ["retrieval"], + "task": "Write Tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 million).", + "dependencies": ["TestRetrieval2.1"], + "cutoff": 60, + "ground": { + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "should_contain": ["81,462"], + "should_not_contain": [], + "files": [".txt"], + "type": "file" + }, + "info": { + "difficulty": "intermediate", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/retrieval/r2/artifacts_out/random_file.txt b/agbenchmark/challenges/retrieval/r2/artifacts_out/random_file.txt deleted file mode 100644 index 8a0eae046..000000000 --- a/agbenchmark/challenges/retrieval/r2/artifacts_out/random_file.txt +++ /dev/null @@ -1 +0,0 @@ -81,462 Millions diff --git a/agbenchmark/challenges/retrieval/r2/data.json b/agbenchmark/challenges/retrieval/r2/data.json deleted file mode 100644 index f558b8584..000000000 --- a/agbenchmark/challenges/retrieval/r2/data.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "name": "TestRetrieval2", - "category": ["retrieval"], - "task": "Write tesla's revenue in 2022 into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", - "dependencies": ["TestBasicRetrieval"], - "cutoff": 60, - "ground": { - "answer": "81,462", - "should_contain": ["81,462"], - "should_not_contain": [], - "files": [".txt"], - "type": "file" - }, - "info": { - "difficulty": "novice", - "description": "Tests ability to retrieve information.", - "side_effects": ["tests if there is in fact an LLM attached"] - } -} diff --git a/agbenchmark/challenges/retrieval/r2_tesla_revenue/artifacts_out/random_file.txt b/agbenchmark/challenges/retrieval/r2_tesla_revenue/artifacts_out/random_file.txt new file mode 100644 index 000000000..8a0eae046 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r2_tesla_revenue/artifacts_out/random_file.txt @@ -0,0 +1 @@ +81,462 Millions diff --git a/agbenchmark/challenges/retrieval/r2_tesla_revenue/data.json b/agbenchmark/challenges/retrieval/r2_tesla_revenue/data.json new file mode 100644 index 000000000..63665ed63 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r2_tesla_revenue/data.json @@ -0,0 +1,19 @@ +{ + "name": "TestRetrieval2.0", + "category": ["retrieval"], + "task": "Write Tesla's revenue in 2022 into a .txt file.", + "dependencies": ["TestBasicRetrieval"], + "cutoff": 60, + "ground": { + "answer": "It was $81.462 billion in 2022.", + "should_contain": ["81"], + "should_not_contain": [], + "files": [".txt"], + "type": "file" + }, + "info": { + "difficulty": "novice", + "description": "A no guardrails search for info", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/retrieval/r3/data.json b/agbenchmark/challenges/retrieval/r3/data.json index eb998ffbf..d3e4bd0f5 100644 --- a/agbenchmark/challenges/retrieval/r3/data.json +++ b/agbenchmark/challenges/retrieval/r3/data.json @@ -2,7 +2,7 @@ "name": "TestRetrieval3", "category": ["retrieval"], "task": "Write Tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", - "dependencies": ["TestRetrieval2"], + "dependencies": ["TestRetrieval2.1"], "cutoff": 60, "ground": { "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 4a62af0b5..b544d2c6e 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -135,8 +135,8 @@ internal_info = ReportManager(str(INTERNAL_LOGS_PATH / "internal_info.json")) def pytest_runtest_makereport(item: Any, call: Any) -> None: + challenge_data = item.funcargs.get("challenge_data", None) if call.when == "call": - challenge_data = item.funcargs.get("challenge_data", None) difficulty = ( challenge_data["info"]["difficulty"] if challenge_data else "unknown" ) @@ -157,6 +157,9 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None: info_details: Any = { "data_path": challenge_location, "is_regression": False, + "task": challenge_data["task"], + "answer": challenge_data["ground"]["answer"], + "description": challenge_data["info"]["description"], "metrics": { "difficulty": difficulty, "success": False, @@ -218,6 +221,10 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None: "run_time" ] = f"{str(round(run_time, 3))} seconds" + info_details["reached_cutoff"] = ( + float(run_time) > challenge_data["cutoff"] + ) + info_manager.add_test(test_name, info_details) diff --git a/agbenchmark/reports/internal_info.json b/agbenchmark/reports/internal_info.json index 0bfad744a..d20e8c778 100644 --- a/agbenchmark/reports/internal_info.json +++ b/agbenchmark/reports/internal_info.json @@ -3,70 +3,83 @@ "TestBasicMemory": [ true, true, - true - ], - "TestBasicRetrieval": [ true, true, - true - ], - "TestCreateSimpleWebServer": [ - false, - false, - false - ], - "TestDebugSimpleTypoWithGuidance": [ - false, - false, - false - ], - "TestDebugSimpleTypoWithoutGuidance": [ + true, false, false, - false + true ], - "TestReadFile": [ + "TestBasicRetrieval": [ true, true, true, true, true ], - "TestRememberMultipleIds": [ + "TestReadFile": [ true, true, - true - ], - "TestRememberMultipleIdsWithNoise": [ true, true, true ], - "TestRememberMultiplePhrasesWithNoise": [ + "TestSearch": [ true, true, - true - ], - "TestRetrieval2": [ true, true, true ], - "TestRetrieval3": [ + "TestWriteFile": [ + true, true, true, true ], - "TestSearch": [ - true, - true, + "TestRetrieval2.2": [ + false, + false, + false, + false + ], + "TestRetrieval2.1": [ + false, + false, + false, + false, + false, + false + ], + "TestRetrieval2.0": [ true, + false + ], + "TestRememberMultipleIds": [ + false, + false, true ], - "TestWriteFile": [ - true, - true, + "TestRememberMultipleIdsWithNoise": [ + false + ], + "TestRememberMultipleWithNoise": [ + false, true + ], + "TestRememberMultiplePhrasesWithNoise": [ + false, + false, + false, + false, + false, + false + ], + "TestDebugSimpleTypoWithGuidance": [ + false, + false, + false, + false ] } } \ No newline at end of file diff --git a/agbenchmark/reports/mini-agi/1.1_TestWriteFile.json b/agbenchmark/reports/mini-agi/1.1_TestWriteFile.json index 637c2d5c5..419052311 100644 --- a/agbenchmark/reports/mini-agi/1.1_TestWriteFile.json +++ b/agbenchmark/reports/mini-agi/1.1_TestWriteFile.json @@ -1,36 +1,27 @@ { + "command": "agbenchmark start 
--test TestWriteFile", + "completion_time": "2023-07-17-13:34", + "metrics": { + "run_time": "23.83 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { "TestWriteFile": { - "data_path": "agbenchmark/challenges/interface/write_file", - "is_regression": true, - "metrics": { - "difficulty": "interface", - "success": true, - "non_mock_success_%": 100.0, - "run_time": "0.009 seconds" - } - }, - "additional": { - "model": "gpt-3.5-turbo" - }, - "command": "agbenchmark start --test TestWriteFile", - "completion_time": "2023-07-17-09:54", - "config": { - "workspace": "${os.path.join(Path.home(), 'miniagi')}" - }, - "metrics": { - "run_time": "22.36 seconds", - "highest_difficulty": "interface: 1" - }, - "tests": { - "TestWriteFile": { - "data_path": "agbenchmark/challenges/interface/write_file", - "is_regression": false, - "metrics": { - "difficulty": "interface", - "success": true, - "success_%": 40.0, - "run_time": "22.169 seconds" - } - } + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": true, + "reached_cutoff": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "23.627 seconds" + } } -} \ No newline at end of file + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-3.5-turbo" + } +} diff --git a/agbenchmark/reports/mini-agi/10.1_TestRememberMultipleWithNoise.json b/agbenchmark/reports/mini-agi/10.1_TestRememberMultipleWithNoise.json new file mode 100644 index 000000000..811fd3e85 --- /dev/null +++ b/agbenchmark/reports/mini-agi/10.1_TestRememberMultipleWithNoise.json @@ -0,0 +1,30 @@ +{ + "command": "agbenchmark start --test TestRememberMultipleWithNoise", + "completion_time": "2023-07-17-21:24", + "metrics": { + "run_time": "77.71 seconds", + "highest_difficulty": "intermediate: 4" + }, + "tests": { + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": true, + "success_%": 50.0, + "run_time": "77.397 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/10_TestRememberMultipleWithNoise.json b/agbenchmark/reports/mini-agi/10_TestRememberMultipleWithNoise.json new file mode 100644 index 000000000..08c2b7075 --- /dev/null +++ b/agbenchmark/reports/mini-agi/10_TestRememberMultipleWithNoise.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestRememberMultipleWithNoise", + "completion_time": "2023-07-17-21:19", + "metrics": { + "run_time": "74.3 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'", + "success_%": 0.0, + "run_time": "74.059 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/11.1_TestRememberMultiplePhrasesWithNoise.json b/agbenchmark/reports/mini-agi/11.1_TestRememberMultiplePhrasesWithNoise.json new file mode 100644 index 000000000..0de6f003c --- /dev/null +++ b/agbenchmark/reports/mini-agi/11.1_TestRememberMultiplePhrasesWithNoise.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise", + "completion_time": "2023-07-17-21:28", + "metrics": { + "run_time": "60.86 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'", + "success_%": 0.0, + "run_time": "60.631 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/11.2_TestRememberMultiplePhrasesWithNoise.json b/agbenchmark/reports/mini-agi/11.2_TestRememberMultiplePhrasesWithNoise.json new file mode 100644 index 000000000..1d2abb8e7 --- /dev/null +++ b/agbenchmark/reports/mini-agi/11.2_TestRememberMultiplePhrasesWithNoise.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise", + "completion_time": "2023-07-17-21:32", + "metrics": { + "run_time": "73.04 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'", + "success_%": 0.0, + "run_time": "72.736 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/11.3_TestRememberMultiplePhrasesWithNoise.json b/agbenchmark/reports/mini-agi/11.3_TestRememberMultiplePhrasesWithNoise.json new file mode 100644 index 000000000..1d256b8c0 --- /dev/null +++ b/agbenchmark/reports/mini-agi/11.3_TestRememberMultiplePhrasesWithNoise.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise", + "completion_time": "2023-07-17-21:34", + "metrics": { + "run_time": "81.59 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'", + "success_%": 0.0, + "run_time": "81.374 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/11.4_TestRememberMultiplePhrasesWithNoise.json b/agbenchmark/reports/mini-agi/11.4_TestRememberMultiplePhrasesWithNoise.json new file mode 100644 index 000000000..e67a6ac3e --- /dev/null +++ b/agbenchmark/reports/mini-agi/11.4_TestRememberMultiplePhrasesWithNoise.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise", + "completion_time": "2023-07-17-21:36", + "metrics": { + "run_time": "98.32 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "98.021 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/11.5_TestRememberMultiplePhrasesWithNoise.json b/agbenchmark/reports/mini-agi/11.5_TestRememberMultiplePhrasesWithNoise.json new file mode 100644 index 000000000..9e76704db --- /dev/null +++ b/agbenchmark/reports/mini-agi/11.5_TestRememberMultiplePhrasesWithNoise.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise", + "completion_time": "2023-07-17-21:42", + "metrics": { + "run_time": "303.13 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'", + "success_%": 0.0, + "run_time": "302.919 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/11_TestRememberMultiplePhrasesWithNoise.json b/agbenchmark/reports/mini-agi/11_TestRememberMultiplePhrasesWithNoise.json new file mode 100644 index 000000000..e98ca330e --- /dev/null +++ b/agbenchmark/reports/mini-agi/11_TestRememberMultiplePhrasesWithNoise.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise", + "completion_time": "2023-07-17-21:27", + "metrics": { + "run_time": "77.72 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'", + "success_%": 0.0, + "run_time": "77.491 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/12.1_TestDebugSimpleTypoWithGuidance.json b/agbenchmark/reports/mini-agi/12.1_TestDebugSimpleTypoWithGuidance.json new file mode 100644 index 000000000..9c9f3dc2a --- /dev/null +++ b/agbenchmark/reports/mini-agi/12.1_TestDebugSimpleTypoWithGuidance.json @@ -0,0 +1,28 @@ +{ + "command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance", + "completion_time": "2023-07-17-21:46", + "metrics": { + "run_time": "87.21 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "86.967 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + } +} \ No newline at end of file diff --git a/agbenchmark/reports/mini-agi/12.2_TestDebugSimpleTypoWithGuidance.json b/agbenchmark/reports/mini-agi/12.2_TestDebugSimpleTypoWithGuidance.json new file mode 100644 index 000000000..4765201fb --- /dev/null +++ b/agbenchmark/reports/mini-agi/12.2_TestDebugSimpleTypoWithGuidance.json @@ -0,0 +1,28 @@ +{ + "command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance", + "completion_time": "2023-07-17-21:47", + "metrics": { + "run_time": "48.52 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "48.208 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + } +} \ No newline at end of file diff --git a/agbenchmark/reports/mini-agi/12.3_TestDebugSimpleTypoWithGuidance.json b/agbenchmark/reports/mini-agi/12.3_TestDebugSimpleTypoWithGuidance.json new file mode 100644 index 000000000..ac2592f33 --- /dev/null +++ b/agbenchmark/reports/mini-agi/12.3_TestDebugSimpleTypoWithGuidance.json @@ -0,0 +1,28 @@ +{ + "command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance", + "completion_time": "2023-07-17-21:55", + "metrics": { + "run_time": "54.95 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": 
"1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "54.741 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + } +} \ No newline at end of file diff --git a/agbenchmark/reports/mini-agi/12_TestDebugSimpleTypoWithGuidance.json b/agbenchmark/reports/mini-agi/12_TestDebugSimpleTypoWithGuidance.json new file mode 100644 index 000000000..e84c6e9a8 --- /dev/null +++ b/agbenchmark/reports/mini-agi/12_TestDebugSimpleTypoWithGuidance.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance", + "completion_time": "2023-07-17-21:44", + "metrics": { + "run_time": "63.37 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "63.125 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/1_TestWriteFIle.json b/agbenchmark/reports/mini-agi/1_TestWriteFIle.json index e64783190..6ac7d1045 100644 --- a/agbenchmark/reports/mini-agi/1_TestWriteFIle.json +++ b/agbenchmark/reports/mini-agi/1_TestWriteFIle.json @@ -9,6 +9,7 @@ "TestWriteFile": { "data_path": "agbenchmark/challenges/interface/write_file", "is_regression": false, + "reached_cutoff": false, "metrics": { "difficulty": "interface", "success": true, @@ -18,8 +19,7 @@ } }, "config": { - "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "agbenchmark.benchmarks" + "workspace": "${os.path.join(Path.home(), 'miniagi')}" }, "additional": { "model": "gpt-4" diff --git a/agbenchmark/reports/mini-agi/2.1_TestReadFile.json b/agbenchmark/reports/mini-agi/2.1_TestReadFile.json index b5d73af99..4758addf1 100644 --- a/agbenchmark/reports/mini-agi/2.1_TestReadFile.json +++ b/agbenchmark/reports/mini-agi/2.1_TestReadFile.json @@ -9,6 +9,7 @@ "TestReadFile": { "data_path": "agbenchmark/challenges/interface/read_file", "is_regression": true, + "reached_cutoff": true, "metrics": { "difficulty": "interface", "success": true, @@ -21,7 +22,6 @@ "workspace": "${os.path.join(Path.home(), 'miniagi')}" }, "additional": { - "model": "gpt-4", - "reached_termination_time": true + "model": "gpt-3.5-turbo" } } diff --git a/agbenchmark/reports/mini-agi/2_TestReadFile.json b/agbenchmark/reports/mini-agi/2_TestReadFile.json index 869eaaac1..87c7956d6 100644 --- a/agbenchmark/reports/mini-agi/2_TestReadFile.json +++ b/agbenchmark/reports/mini-agi/2_TestReadFile.json @@ -9,6 +9,7 @@ "TestReadFile": { "data_path": "agbenchmark/challenges/interface/read_file", "is_regression": true, + "reached_cutoff": false, "metrics": { "difficulty": "interface", "success": true, @@ 
-18,8 +19,7 @@ } }, "config": { - "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "agbenchmark.benchmarks" + "workspace": "${os.path.join(Path.home(), 'miniagi')}" }, "additional": { "model": "gpt-4" diff --git a/agbenchmark/reports/mini-agi/3.1_TestSearch.json b/agbenchmark/reports/mini-agi/3.1_TestSearch.json new file mode 100644 index 000000000..6a2744e72 --- /dev/null +++ b/agbenchmark/reports/mini-agi/3.1_TestSearch.json @@ -0,0 +1,27 @@ +{ + "command": "agbenchmark start --test TestSearch", + "completion_time": "2023-07-17-13:35", + "metrics": { + "run_time": "20.58 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": true, + "reached_cutoff": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "20.367 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-3.5-turbo" + } +} diff --git a/agbenchmark/reports/mini-agi/3_TestSearch.json b/agbenchmark/reports/mini-agi/3_TestSearch.json index d9d05db4a..c7d6c4309 100644 --- a/agbenchmark/reports/mini-agi/3_TestSearch.json +++ b/agbenchmark/reports/mini-agi/3_TestSearch.json @@ -9,6 +9,7 @@ "TestSearch": { "data_path": "agbenchmark/challenges/interface/search", "is_regression": true, + "reached_cutoff": false, "metrics": { "difficulty": "interface", "success": true, @@ -18,8 +19,7 @@ } }, "config": { - "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "agbenchmark.benchmarks" + "workspace": "${os.path.join(Path.home(), 'miniagi')}" }, "additional": { "model": "gpt-4" diff --git a/agbenchmark/reports/mini-agi/4.1_TestBasicRetrieval.json b/agbenchmark/reports/mini-agi/4.1_TestBasicRetrieval.json new file mode 100644 index 000000000..6ff0fa63b --- /dev/null +++ b/agbenchmark/reports/mini-agi/4.1_TestBasicRetrieval.json @@ -0,0 +1,27 @@ +{ + "command": "agbenchmark start --test TestBasicRetrieval", + "completion_time": "2023-07-17-13:31", + "metrics": { + "run_time": "26.05 seconds", + "highest_difficulty": "basic: 2" + }, + "tests": { + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1", + "is_regression": true, + "reached_cutoff": false, + "metrics": { + "difficulty": "basic", + "success": true, + "success_%": 100.0, + "run_time": "25.818 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json b/agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json deleted file mode 100644 index d72d599d8..000000000 --- a/agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance", - "completion_time": "2023-07-15-22:16", - "metrics": { - "run_time": "45.92 seconds", - "highest_difficulty": ": 0" - }, - "tests": { - "TestDebugSimpleTypoWithGuidance": { - "data_path": "agbenchmark/challenges/code/d1", - "is_regression": false, - "metrics": { - "difficulty": "basic", - "success": false, - "fail_reason": "assert 1 in [0.0]", - "success_%": 0.0, - "run_time": "45.599 seconds" - } - } - }, - "config": { - "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "agbenchmark.benchmarks" - }, - "additional": { - "model": "gpt-4" - } -} diff --git 
a/agbenchmark/reports/mini-agi/4_TestBasicRetrieval.json b/agbenchmark/reports/mini-agi/4_TestBasicRetrieval.json new file mode 100644 index 000000000..54c4fdcca --- /dev/null +++ b/agbenchmark/reports/mini-agi/4_TestBasicRetrieval.json @@ -0,0 +1,27 @@ +{ + "command": "agbenchmark start --test TestBasicRetrieval", + "completion_time": "2023-07-17-13:22", + "metrics": { + "run_time": "61.24 seconds", + "highest_difficulty": "basic: 2" + }, + "tests": { + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1", + "is_regression": true, + "reached_cutoff": true, + "metrics": { + "difficulty": "basic", + "success": true, + "success_%": 100.0, + "run_time": "60.872 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-3.5-turbo" + } +} diff --git a/agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json b/agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json deleted file mode 100644 index 7985a7843..000000000 --- a/agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance", - "completion_time": "2023-07-15-22:15", - "metrics": { - "run_time": "32.99 seconds", - "highest_difficulty": ": 0" - }, - "tests": { - "TestDebugSimpleTypoWithGuidance": { - "data_path": "agbenchmark/challenges/code/d1", - "is_regression": false, - "metrics": { - "difficulty": "basic", - "success": false, - "fail_reason": "assert 1 in [0.0]", - "success_%": 0.0, - "run_time": "32.582 seconds" - } - } - }, - "config": { - "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "agbenchmark.benchmarks" - }, - "additional": { - "model": "gpt-4" - } -} diff --git a/agbenchmark/reports/mini-agi/5.1_TestRetrieval2.0.json b/agbenchmark/reports/mini-agi/5.1_TestRetrieval2.0.json new file mode 100644 index 000000000..4149ebe70 --- /dev/null +++ b/agbenchmark/reports/mini-agi/5.1_TestRetrieval2.0.json @@ -0,0 +1,30 @@ +{ + "command": "agbenchmark start --test TestRetrieval2.0", + "completion_time": "2023-07-17-17:10", + "metrics": { + "run_time": "66.81 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "reached_cutoff": true, + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "66.547 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-3.5-turbo" + } +} diff --git a/agbenchmark/reports/mini-agi/5_TestRetrieval2.0.json b/agbenchmark/reports/mini-agi/5_TestRetrieval2.0.json new file mode 100644 index 000000000..28d091d28 --- /dev/null +++ b/agbenchmark/reports/mini-agi/5_TestRetrieval2.0.json @@ -0,0 +1,29 @@ +{ + "command": "agbenchmark start --test TestRetrieval2", + "completion_time": "2023-07-17-13:54", + "metrics": { + "run_time": "36 seconds", + "highest_difficulty": "TestRetrieval2: 3" + }, + "tests": { + "TestRetrieval2": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "reached_cutoff": false, + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": 
"novice", + "success": true, + "success_%": 50.0, + "run_time": "35.59 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/6.1_TestRetrieval2.1.json b/agbenchmark/reports/mini-agi/6.1_TestRetrieval2.1.json new file mode 100644 index 000000000..ed3ede1d3 --- /dev/null +++ b/agbenchmark/reports/mini-agi/6.1_TestRetrieval2.1.json @@ -0,0 +1,30 @@ +{ + "command": "agbenchmark start --test TestRetrieval2.1", + "completion_time": "2023-07-17-17:27", + "metrics": { + "run_time": "64.44 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "reached_cutoff:": true, + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "assert 1 in [0.0, 0.0]", + "success_%": 0.0, + "run_time": "64.216 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-3.5-turbo" + } +} diff --git a/agbenchmark/reports/mini-agi/6.2_TestRetrieval2.1.json b/agbenchmark/reports/mini-agi/6.2_TestRetrieval2.1.json new file mode 100644 index 000000000..04f972329 --- /dev/null +++ b/agbenchmark/reports/mini-agi/6.2_TestRetrieval2.1.json @@ -0,0 +1,30 @@ +{ + "command": "agbenchmark start --test TestRetrieval2.1", + "completion_time": "2023-07-17-17:53", + "metrics": { + "run_time": "30.08 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "29.711 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-3.5-turbo" + } +} diff --git a/agbenchmark/reports/mini-agi/6.3_TestRetrieval2.1.json b/agbenchmark/reports/mini-agi/6.3_TestRetrieval2.1.json new file mode 100644 index 000000000..383774347 --- /dev/null +++ b/agbenchmark/reports/mini-agi/6.3_TestRetrieval2.1.json @@ -0,0 +1,30 @@ +{ + "command": "agbenchmark start --test TestRetrieval2.1", + "completion_time": "2023-07-17-17:54", + "metrics": { + "run_time": "27.49 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "27.266 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-3.5-turbo" + } +} diff --git a/agbenchmark/reports/mini-agi/6.4_TestRetrieval2.1.json b/agbenchmark/reports/mini-agi/6.4_TestRetrieval2.1.json new file mode 100644 index 000000000..71cd9e007 --- /dev/null +++ 
b/agbenchmark/reports/mini-agi/6.4_TestRetrieval2.1.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestRetrieval2.1", + "completion_time": "2023-07-17-17:56", + "metrics": { + "run_time": "23.64 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "23.42 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-3.5-turbo" + } +} diff --git a/agbenchmark/reports/mini-agi/6_TestRetrieval2.1.json b/agbenchmark/reports/mini-agi/6_TestRetrieval2.1.json new file mode 100644 index 000000000..1dceec03d --- /dev/null +++ b/agbenchmark/reports/mini-agi/6_TestRetrieval2.1.json @@ -0,0 +1,30 @@ +{ + "command": "agbenchmark start --test TestRetrieval2.1", + "completion_time": "2023-07-17-14:03", + "metrics": { + "run_time": "68.39 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "reached_cutoff": true, + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "assert 1 in []", + "success_%": 0.0, + "run_time": "68.15 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/7.1_TestRetrieval2.2.json b/agbenchmark/reports/mini-agi/7.1_TestRetrieval2.2.json new file mode 100644 index 000000000..99373f7f1 --- /dev/null +++ b/agbenchmark/reports/mini-agi/7.1_TestRetrieval2.2.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestRetrieval2.2", + "completion_time": "2023-07-17-17:57", + "metrics": { + "run_time": "31.1 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. 
In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "30.888 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/7_TestRetrieval2.2.json b/agbenchmark/reports/mini-agi/7_TestRetrieval2.2.json new file mode 100644 index 000000000..ccdca26b3 --- /dev/null +++ b/agbenchmark/reports/mini-agi/7_TestRetrieval2.2.json @@ -0,0 +1,30 @@ +{ + "command": "agbenchmark start --test TestRetrieval2.2", + "completion_time": "2023-07-17-14:04", + "metrics": { + "run_time": "28.08 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "reached_cutoff": false, + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "27.857 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/8.1_TestBasicMemory.json b/agbenchmark/reports/mini-agi/8.1_TestBasicMemory.json new file mode 100644 index 000000000..66cc2f9ae --- /dev/null +++ b/agbenchmark/reports/mini-agi/8.1_TestBasicMemory.json @@ -0,0 +1,30 @@ +{ + "command": "agbenchmark start --test TestBasicMemory", + "completion_time": "2023-07-17-18:22", + "metrics": { + "run_time": "53.48 seconds", + "highest_difficulty": "basic: 2" + }, + "tests": { + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": true, + "success_%": 75.0, + "run_time": "53.252 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/8_TestBasicMemory.json b/agbenchmark/reports/mini-agi/8_TestBasicMemory.json new file mode 100644 index 000000000..7ce535507 --- /dev/null +++ b/agbenchmark/reports/mini-agi/8_TestBasicMemory.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestBasicMemory", + "completion_time": "2023-07-17-18:18", + "metrics": { + "run_time": "62.11 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\results.txt'", + "success_%": 83.33, + "run_time": "61.879 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-3.5-turbo" + } +} diff --git a/agbenchmark/reports/mini-agi/9.1_TestRememberMultipleIds.json b/agbenchmark/reports/mini-agi/9.1_TestRememberMultipleIds.json new file mode 100644 index 000000000..462e73900 --- /dev/null +++ b/agbenchmark/reports/mini-agi/9.1_TestRememberMultipleIds.json @@ -0,0 +1,30 @@ +{ + "command": "agbenchmark start --test TestRememberMultipleIds", + "completion_time": "2023-07-17-20:22", + "metrics": { + "run_time": "57.6 seconds", + "highest_difficulty": "novice: 3" + }, + "tests": { + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": true, + "success_%": 33.33, + "run_time": "57.355 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/9_TestRememberMultipleIds.json b/agbenchmark/reports/mini-agi/9_TestRememberMultipleIds.json new file mode 100644 index 000000000..aa726196e --- /dev/null +++ b/agbenchmark/reports/mini-agi/9_TestRememberMultipleIds.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestRememberMultipleIds", + "completion_time": "2023-07-17-18:33", + "metrics": { + "run_time": "61.32 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'", + "success_%": 0.0, + "run_time": "61.089 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-3.5-turbo" + } +} diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py index f1ed43639..63f4d836c 100644 --- a/agbenchmark/utils.py +++ b/agbenchmark/utils.py @@ -55,7 +55,6 @@ def calculate_info_test_path(reports_path: Path) -> str: all_prefix_numbers.append(math.floor(number)) max_prefix = max(all_prefix_numbers, default=0) - print("HEY WE ARE HERE BIG DAWG", max_prefix) run_name = f"{max_prefix + 1}_{test_arg}.json" else: # Take the number from before the _ and add the .{number} @@ -118,7 +117,9 @@ def get_highest_success_difficulty(data: dict) -> str: else: highest_difficulty_str = "" - return f"{highest_difficulty_str}: {highest_difficulty_level}" + if highest_difficulty_level: + return f"{highest_difficulty_str}: {highest_difficulty_level}" + return "No successful tests" def assign_paths(folder_path: Path) -> tuple[str, str, str]: diff --git a/agent/mini-agi b/agent/mini-agi index 0a9fcd8c3..4a346ab7c 160000 --- a/agent/mini-agi +++ b/agent/mini-agi @@ -1 +1 @@ -Subproject commit 0a9fcd8c3d6352ef42d436cff7b64683a7a7ca2d +Subproject commit 4a346ab7cb8dbcfd3bf2cee49448d26e01406ba3 -- cgit v1.2.3 From 328643e5f2bb6f447b02b6fe5779eea6dcd0db59 Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Tue, 18 Jul 2023 02:44:35 +0000 Subject: smol-developer-20230718024435 --- reports/smol-developer/file2_07-18-02-43.json | 266 ++++++++++++++++++++++++++ 1 file changed, 266 insertions(+) create mode 100644 reports/smol-developer/file2_07-18-02-43.json diff --git a/reports/smol-developer/file2_07-18-02-43.json b/reports/smol-developer/file2_07-18-02-43.json new file mode 100644 index 000000000..c07c322be --- /dev/null +++ b/reports/smol-developer/file2_07-18-02-43.json @@ -0,0 +1,266 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-18-02:44", + "metrics": { + "run_time": "38.24 seconds", + "highest_difficulty": "advanced: 5" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "6.509 seconds" + }, + "reached_cutoff": false + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": true, + "success_%": 100.0, + "run_time": "9.258 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "13.455 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "7.264 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "1.572 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": 
"agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. 
", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "generated" + } +} \ No newline at end of file -- cgit v1.2.3 From 0eb04c1e342e7b937f86797884dffff5972c8adb Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Tue, 18 Jul 2023 02:45:45 +0000 Subject: Auto-GPT-20230718024545 --- reports/Auto-GPT/file2_07-18-02-45.json | 268 ++++++++++++++++++++++++++++++++ 1 file changed, 268 insertions(+) create mode 100644 reports/Auto-GPT/file2_07-18-02-45.json diff --git a/reports/Auto-GPT/file2_07-18-02-45.json b/reports/Auto-GPT/file2_07-18-02-45.json new file mode 100644 index 000000000..f0cc9b962 --- /dev/null +++ b/reports/Auto-GPT/file2_07-18-02-45.json @@ -0,0 +1,268 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-18-02:45", + "metrics": { + "run_time": "26.11 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "25.778 seconds" + }, + "reached_cutoff": false + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.006 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestThreeSum::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0], venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0], venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0], venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0], venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "autogpt/workspace/auto_gpt_workspace" + } +} \ No newline at end of file -- cgit v1.2.3 From 857b72df87a0a9e7977199369ece51c540d1e6e4 Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Tue, 18 Jul 2023 02:46:36 +0000 Subject: gpt-engineer-20230718024636 --- reports/gpt-engineer/file2_07-18-02-44.json | 267 ++++++++++++++++++++++++++++ 1 file changed, 267 insertions(+) create mode 100644 reports/gpt-engineer/file2_07-18-02-44.json diff --git a/reports/gpt-engineer/file2_07-18-02-44.json b/reports/gpt-engineer/file2_07-18-02-44.json new file mode 100644 index 000000000..1c24f4f8c --- /dev/null +++ b/reports/gpt-engineer/file2_07-18-02-44.json @@ -0,0 +1,267 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-18-02:46", + "metrics": { + "run_time": "123.02 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "72.83 seconds" + }, + "reached_cutoff": true + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. 
You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "[Errno 20] Not a directory: '/home/runner/work/Auto-GPT-Benchmarks/Auto-GPT-Benchmarks/agent/gpt-engineer/projects/my-new-project/workspace'", + "success_%": 0.0, + "run_time": "47.884 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "[Errno 20] Not a directory: '/home/runner/work/Auto-GPT-Benchmarks/Auto-GPT-Benchmarks/agent/gpt-engineer/projects/my-new-project/workspace/file_to_check.txt'", + "success_%": 0.0, + "run_time": "0.955 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in []", + "success_%": 0.0, + "run_time": "0.944 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestThreeSum::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. 
", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.003 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. 
In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "projects/my-new-project/workspace" + } +} \ No newline at end of file -- cgit v1.2.3 From 86978b59246de8cce22928f653974fca51fc358c Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Tue, 18 Jul 2023 02:51:30 +0000 Subject: mini-agi-20230718025130 --- reports/mini-agi/file1_07-18-02-44.json | 260 ++++++++++++++++++++++++++++++++ 1 file changed, 260 insertions(+) create mode 100644 reports/mini-agi/file1_07-18-02-44.json diff --git a/reports/mini-agi/file1_07-18-02-44.json b/reports/mini-agi/file1_07-18-02-44.json new file mode 100644 index 000000000..a8e9f0fe4 --- /dev/null +++ b/reports/mini-agi/file1_07-18-02-44.json @@ -0,0 +1,260 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-18-02:51", + "metrics": { + "run_time": "407.24 seconds", + "highest_difficulty": "advanced: 5" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": true, + "task": "Print the the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "14.551 seconds" + }, + "reached_cutoff": false + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "37.551 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": true, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "19.674 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": true, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "21.582 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestThreeSum::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": true, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": true, + "success_%": 100.0, + "run_time": "23.659 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": true, + "success_%": 20.0, + "run_time": "45.503 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + 
"is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: '/home/runner/miniagi/result.txt'", + "success_%": 66.67, + "run_time": "69.968 seconds" + }, + "reached_cutoff": true + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": true, + "success_%": 66.67, + "run_time": "30.055 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": true, + "success_%": 100.0, + "run_time": "57.289 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": true, + "success_%": 100.0, + "run_time": "63.121 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 25.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "24.052 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 33.33, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. 
In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + } +} \ No newline at end of file -- cgit v1.2.3 From e5856588653303244ce769a7d5d70320f3048806 Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Tue, 18 Jul 2023 08:21:11 +0000 Subject: gpt-engineer-20230718082111 --- reports/gpt-engineer/file3_07-18-08-19.json | 267 ++++++++++++++++++++++++++++ 1 file changed, 267 insertions(+) create mode 100644 reports/gpt-engineer/file3_07-18-08-19.json diff --git a/reports/gpt-engineer/file3_07-18-08-19.json b/reports/gpt-engineer/file3_07-18-08-19.json new file mode 100644 index 000000000..10e4cf79e --- /dev/null +++ b/reports/gpt-engineer/file3_07-18-08-19.json @@ -0,0 +1,267 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-18-08:21", + "metrics": { + "run_time": "123.71 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "65.886 seconds" + }, + "reached_cutoff": true + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "[Errno 20] Not a directory: '/home/runner/work/Auto-GPT-Benchmarks/Auto-GPT-Benchmarks/agent/gpt-engineer/projects/my-new-project/workspace'", + "success_%": 0.0, + "run_time": "55.938 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in []", + "success_%": 0.0, + "run_time": "0.788 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "[Errno 20] Not a directory: '/home/runner/work/Auto-GPT-Benchmarks/Auto-GPT-Benchmarks/agent/gpt-engineer/projects/my-new-project/workspace/file_to_check.txt'", + "success_%": 0.0, + "run_time": "0.787 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestThreeSum::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "projects/my-new-project/workspace" + } +} \ No newline at end of file -- cgit v1.2.3 From b2f52f08f4f66beb2ce7de15801a50feb1836c7b Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Tue, 18 Jul 2023 08:22:55 +0000 Subject: Auto-GPT-20230718082255 --- reports/Auto-GPT/file3_07-18-08-19.json | 267 ++++++++++++++++++++++++++++++++ 1 file changed, 267 insertions(+) create mode 100644 reports/Auto-GPT/file3_07-18-08-19.json diff --git a/reports/Auto-GPT/file3_07-18-08-19.json b/reports/Auto-GPT/file3_07-18-08-19.json new file mode 100644 index 000000000..653f7a8da --- /dev/null +++ b/reports/Auto-GPT/file3_07-18-08-19.json @@ -0,0 +1,267 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-18-08:22", + "metrics": { + "run_time": "202.62 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "36.149 seconds" + }, + "reached_cutoff": false + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "81.97 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "23.569 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "60.708 seconds" + }, + "reached_cutoff": true + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestThreeSum::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "autogpt/workspace/auto_gpt_workspace" + } +} \ No newline at end of file -- cgit v1.2.3 From 15d50a5ccb8b59bbee5e88c579726f4487a0eca3 Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Tue, 18 Jul 2023 08:24:43 +0000 Subject: smol-developer-20230718082443 --- reports/smol-developer/file3_07-18-08-19.json | 266 ++++++++++++++++++++++++++ 1 file changed, 266 insertions(+) create mode 100644 reports/smol-developer/file3_07-18-08-19.json diff --git a/reports/smol-developer/file3_07-18-08-19.json b/reports/smol-developer/file3_07-18-08-19.json new file mode 100644 index 000000000..7124e24c3 --- /dev/null +++ b/reports/smol-developer/file3_07-18-08-19.json @@ -0,0 +1,266 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-18-08:24", + "metrics": { + "run_time": "315.94 seconds", + "highest_difficulty": "advanced: 5" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "5.447 seconds" + }, + "reached_cutoff": false + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. 
You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": true, + "success_%": 100.0, + "run_time": "286.755 seconds" + }, + "reached_cutoff": true + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "12.291 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "9.912 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "1.322 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 million).",
+      "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions",
+      "description": "Tests the ability to retrieve information.",
+      "metrics": {
+        "difficulty": "intermediate",
+        "success": false,
+        "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.001 seconds"
+      },
+      "reached_cutoff": false
+    },
+    "TestRetrieval2.2": {
+      "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting",
+      "is_regression": false,
+      "task": "Write Tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 million).",
+      "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.",
+      "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.",
+      "metrics": {
+        "difficulty": "intermediate",
+        "success": false,
+        "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.001 seconds"
+      },
+      "reached_cutoff": false
+    },
+    "TestRememberMultiplePhrasesWithNoise": {
+      "data_path": "agbenchmark/challenges/memory/m4_phrases",
+      "is_regression": false,
+      "task": "Follow the instructions in the instructions_1.txt file",
+      "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
+      "description": "Tests the agent's ability to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file.
Some noise is also added to the files to test the agent's ability to filter out noise.",
+      "metrics": {
+        "difficulty": "advanced",
+        "success": false,
+        "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.001 seconds"
+      },
+      "reached_cutoff": false
+    }
+  },
+  "config": {
+    "workspace": "generated"
+  }
+}
\ No newline at end of file
-- cgit v1.2.3


From d46124a9d82eb3d1fe4aa53acd165dc4a9817820 Mon Sep 17 00:00:00 2001
From: merwanehamadi
Date: Tue, 18 Jul 2023 09:17:45 -0700
Subject: Push reports to google drive (#167)

Signed-off-by: Merwane Hamadi
---
 .github/workflows/ci.yml |   3 +
 .gitignore               |   1 +
 json_to_base_64.py       |  17 ++
 poetry.lock              | 753 ++++++++++++++++++++++++++++++++++-------------
 pyproject.toml           |   3 +
 send_to_googledrive.py   | 112 +++++++
 6 files changed, 682 insertions(+), 207 deletions(-)
 create mode 100644 json_to_base_64.py
 create mode 100644 send_to_googledrive.py

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 3b0dc50fe..e34b2e864 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -191,3 +191,6 @@ jobs:
           git fetch origin $current_branch
           git rebase origin/$current_branch
           git push origin HEAD
+          poetry run python send_to_googledrive.py
+        env:
+          GDRIVE_BASE64: ${{ secrets.GDRIVE_BASE64 }}
diff --git a/.gitignore b/.gitignore
index 7d0419ca4..1b0f3ba14 100644
--- a/.gitignore
+++ b/.gitignore
@@ -162,3 +162,4 @@ cython_debug/
 .idea/
 .DS_Store
 ```
+secrets.json
diff --git a/json_to_base_64.py b/json_to_base_64.py
new file mode 100644
index 000000000..42cbc4df8
--- /dev/null
+++ b/json_to_base_64.py
@@ -0,0 +1,17 @@
+import base64
+import json
+
+# Load JSON data from a file
+with open("secrets.json", "r") as f:
+    data = json.load(f)
+
+# Convert the JSON object into a string
+json_string = json.dumps(data)
+
+# Encode the string into bytes
+json_bytes = json_string.encode("utf-8")
+
+# Convert the bytes to a base64 string
+base64_string = base64.b64encode(json_bytes).decode("utf-8")
+
+print(base64_string)
diff --git a/poetry.lock b/poetry.lock
index ad72f5e10..5b51cb14e 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -217,6 +217,18 @@ d = ["aiohttp (>=3.7.4)"]
 jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"]
 uvloop = ["uvloop (>=0.15.2)"]
 
+[[package]]
+name = "cachetools"
+version = "5.3.1"
+description = "Extensible memoizing collections and decorators"
+category = "dev"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "cachetools-5.3.1-py3-none-any.whl", hash = "sha256:95ef631eeaea14ba2e36f06437f36463aac3a096799e876ee55e5cdccb102590"},
+    {file = "cachetools-5.3.1.tar.gz", hash = "sha256:dce83f2d9b4e1f732a8cd44af8e8fab2dbe46201467fc98b3ef8f269092bf62b"},
+]
+
 [[package]]
 name = "certifi"
 version = "2023.5.7"
 description = "Python package for providing Mozilla's CA Bundle."
 category = "main"
 optional = false
 python-versions = ">=3.6"
 files = [
     {file = "certifi-2023.5.7-py3-none-any.whl", hash = "sha256:c6c2e98f5c7869929f25f425ddc5bcffadcb1f8f3abd6d285b0447447c2300b5"},
     {file = "certifi-2023.5.7.tar.gz", hash = "sha256:0f0d56dc5a6ad56fd4ba36484d6cc34451e1c6548c61daad8c320169f91eddc7"},
 ]
 
 [[package]]
 name = "charset-normalizer"
-version = "3.1.0"
+version = "3.2.0"
 description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
category = "main" optional = false python-versions = ">=3.7.0" files = [ - {file = "charset-normalizer-3.1.0.tar.gz", hash = "sha256:34e0a2f9c370eb95597aae63bf85eb5e96826d81e3dcf88b8886012906f509b5"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e0ac8959c929593fee38da1c2b64ee9778733cdf03c482c9ff1d508b6b593b2b"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d7fc3fca01da18fbabe4625d64bb612b533533ed10045a2ac3dd194bfa656b60"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:04eefcee095f58eaabe6dc3cc2262f3bcd776d2c67005880894f447b3f2cb9c1"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20064ead0717cf9a73a6d1e779b23d149b53daf971169289ed2ed43a71e8d3b0"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1435ae15108b1cb6fffbcea2af3d468683b7afed0169ad718451f8db5d1aff6f"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c84132a54c750fda57729d1e2599bb598f5fa0344085dbde5003ba429a4798c0"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75f2568b4189dda1c567339b48cba4ac7384accb9c2a7ed655cd86b04055c795"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:11d3bcb7be35e7b1bba2c23beedac81ee893ac9871d0ba79effc7fc01167db6c"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:891cf9b48776b5c61c700b55a598621fdb7b1e301a550365571e9624f270c203"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:5f008525e02908b20e04707a4f704cd286d94718f48bb33edddc7d7b584dddc1"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:b06f0d3bf045158d2fb8837c5785fe9ff9b8c93358be64461a1089f5da983137"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:49919f8400b5e49e961f320c735388ee686a62327e773fa5b3ce6721f7e785ce"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:22908891a380d50738e1f978667536f6c6b526a2064156203d418f4856d6e86a"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-win32.whl", hash = "sha256:12d1a39aa6b8c6f6248bb54550efcc1c38ce0d8096a146638fd4738e42284448"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:65ed923f84a6844de5fd29726b888e58c62820e0769b76565480e1fdc3d062f8"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9a3267620866c9d17b959a84dd0bd2d45719b817245e49371ead79ed4f710d19"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6734e606355834f13445b6adc38b53c0fd45f1a56a9ba06c2058f86893ae8017"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f8303414c7b03f794347ad062c0516cee0e15f7a612abd0ce1e25caf6ceb47df"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aaf53a6cebad0eae578f062c7d462155eada9c172bd8c4d250b8c1d8eb7f916a"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3dc5b6a8ecfdc5748a7e429782598e4f17ef378e3e272eeb1340ea57c9109f41"}, - {file = 
"charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e1b25e3ad6c909f398df8921780d6a3d120d8c09466720226fc621605b6f92b1"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ca564606d2caafb0abe6d1b5311c2649e8071eb241b2d64e75a0d0065107e62"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b82fab78e0b1329e183a65260581de4375f619167478dddab510c6c6fb04d9b6"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:bd7163182133c0c7701b25e604cf1611c0d87712e56e88e7ee5d72deab3e76b5"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:11d117e6c63e8f495412d37e7dc2e2fff09c34b2d09dbe2bee3c6229577818be"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:cf6511efa4801b9b38dc5546d7547d5b5c6ef4b081c60b23e4d941d0eba9cbeb"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:abc1185d79f47c0a7aaf7e2412a0eb2c03b724581139193d2d82b3ad8cbb00ac"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cb7b2ab0188829593b9de646545175547a70d9a6e2b63bf2cd87a0a391599324"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-win32.whl", hash = "sha256:c36bcbc0d5174a80d6cccf43a0ecaca44e81d25be4b7f90f0ed7bcfbb5a00909"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:cca4def576f47a09a943666b8f829606bcb17e2bc2d5911a46c8f8da45f56755"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:0c95f12b74681e9ae127728f7e5409cbbef9cd914d5896ef238cc779b8152373"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fca62a8301b605b954ad2e9c3666f9d97f63872aa4efcae5492baca2056b74ab"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ac0aa6cd53ab9a31d397f8303f92c42f534693528fafbdb997c82bae6e477ad9"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c3af8e0f07399d3176b179f2e2634c3ce9c1301379a6b8c9c9aeecd481da494f"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a5fc78f9e3f501a1614a98f7c54d3969f3ad9bba8ba3d9b438c3bc5d047dd28"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:628c985afb2c7d27a4800bfb609e03985aaecb42f955049957814e0491d4006d"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:74db0052d985cf37fa111828d0dd230776ac99c740e1a758ad99094be4f1803d"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:1e8fcdd8f672a1c4fc8d0bd3a2b576b152d2a349782d1eb0f6b8e52e9954731d"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:04afa6387e2b282cf78ff3dbce20f0cc071c12dc8f685bd40960cc68644cfea6"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:dd5653e67b149503c68c4018bf07e42eeed6b4e956b24c00ccdf93ac79cdff84"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d2686f91611f9e17f4548dbf050e75b079bbc2a82be565832bc8ea9047b61c8c"}, - {file = 
"charset_normalizer-3.1.0-cp37-cp37m-win32.whl", hash = "sha256:4155b51ae05ed47199dc5b2a4e62abccb274cee6b01da5b895099b61b1982974"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:322102cdf1ab682ecc7d9b1c5eed4ec59657a65e1c146a0da342b78f4112db23"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:e633940f28c1e913615fd624fcdd72fdba807bf53ea6925d6a588e84e1151531"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:3a06f32c9634a8705f4ca9946d667609f52cf130d5548881401f1eb2c39b1e2c"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7381c66e0561c5757ffe616af869b916c8b4e42b367ab29fedc98481d1e74e14"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3573d376454d956553c356df45bb824262c397c6e26ce43e8203c4c540ee0acb"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e89df2958e5159b811af9ff0f92614dabf4ff617c03a4c1c6ff53bf1c399e0e1"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:78cacd03e79d009d95635e7d6ff12c21eb89b894c354bd2b2ed0b4763373693b"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de5695a6f1d8340b12a5d6d4484290ee74d61e467c39ff03b39e30df62cf83a0"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1c60b9c202d00052183c9be85e5eaf18a4ada0a47d188a83c8f5c5b23252f649"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:f645caaf0008bacf349875a974220f1f1da349c5dbe7c4ec93048cdc785a3326"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ea9f9c6034ea2d93d9147818f17c2a0860d41b71c38b9ce4d55f21b6f9165a11"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:80d1543d58bd3d6c271b66abf454d437a438dff01c3e62fdbcd68f2a11310d4b"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:73dc03a6a7e30b7edc5b01b601e53e7fc924b04e1835e8e407c12c037e81adbd"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6f5c2e7bc8a4bf7c426599765b1bd33217ec84023033672c1e9a8b35eaeaaaf8"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-win32.whl", hash = "sha256:12a2b561af122e3d94cdb97fe6fb2bb2b82cef0cdca131646fdb940a1eda04f0"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:3160a0fd9754aab7d47f95a6b63ab355388d890163eb03b2d2b87ab0a30cfa59"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:38e812a197bf8e71a59fe55b757a84c1f946d0ac114acafaafaf21667a7e169e"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6baf0baf0d5d265fa7944feb9f7451cc316bfe30e8df1a61b1bb08577c554f31"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8f25e17ab3039b05f762b0a55ae0b3632b2e073d9c8fc88e89aca31a6198e88f"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3747443b6a904001473370d7810aa19c3a180ccd52a7157aacc264a5ac79265e"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:b116502087ce8a6b7a5f1814568ccbd0e9f6cfd99948aa59b0e241dc57cf739f"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d16fd5252f883eb074ca55cb622bc0bee49b979ae4e8639fff6ca3ff44f9f854"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21fa558996782fc226b529fdd2ed7866c2c6ec91cee82735c98a197fae39f706"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6f6c7a8a57e9405cad7485f4c9d3172ae486cfef1344b5ddd8e5239582d7355e"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:ac3775e3311661d4adace3697a52ac0bab17edd166087d493b52d4f4f553f9f0"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:10c93628d7497c81686e8e5e557aafa78f230cd9e77dd0c40032ef90c18f2230"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:6f4f4668e1831850ebcc2fd0b1cd11721947b6dc7c00bf1c6bd3c929ae14f2c7"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:0be65ccf618c1e7ac9b849c315cc2e8a8751d9cfdaa43027d4f6624bd587ab7e"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:53d0a3fa5f8af98a1e261de6a3943ca631c526635eb5817a87a59d9a57ebf48f"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-win32.whl", hash = "sha256:a04f86f41a8916fe45ac5024ec477f41f886b3c435da2d4e3d2709b22ab02af1"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:830d2948a5ec37c386d3170c483063798d7879037492540f10a475e3fd6f244b"}, - {file = "charset_normalizer-3.1.0-py3-none-any.whl", hash = "sha256:3d9098b479e78c85080c98e1e35ff40b4a31d8953102bb0fd7d1b6f8a2111a3d"}, + {file = "charset-normalizer-3.2.0.tar.gz", hash = "sha256:3bb3d25a8e6c0aedd251753a79ae98a093c7e7b471faa3aa9a93a81431987ace"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0b87549028f680ca955556e3bd57013ab47474c3124dc069faa0b6545b6c9710"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7c70087bfee18a42b4040bb9ec1ca15a08242cf5867c58726530bdf3945672ed"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a103b3a7069b62f5d4890ae1b8f0597618f628b286b03d4bc9195230b154bfa9"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94aea8eff76ee6d1cdacb07dd2123a68283cb5569e0250feab1240058f53b623"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:db901e2ac34c931d73054d9797383d0f8009991e723dab15109740a63e7f902a"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b0dac0ff919ba34d4df1b6131f59ce95b08b9065233446be7e459f95554c0dc8"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:193cbc708ea3aca45e7221ae58f0fd63f933753a9bfb498a3b474878f12caaad"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:09393e1b2a9461950b1c9a45d5fd251dc7c6f228acab64da1c9c0165d9c7765c"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:baacc6aee0b2ef6f3d308e197b5d7a81c0e70b06beae1f1fcacffdbd124fe0e3"}, + {file = 
"charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:bf420121d4c8dce6b889f0e8e4ec0ca34b7f40186203f06a946fa0276ba54029"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:c04a46716adde8d927adb9457bbe39cf473e1e2c2f5d0a16ceb837e5d841ad4f"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:aaf63899c94de41fe3cf934601b0f7ccb6b428c6e4eeb80da72c58eab077b19a"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d62e51710986674142526ab9f78663ca2b0726066ae26b78b22e0f5e571238dd"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-win32.whl", hash = "sha256:04e57ab9fbf9607b77f7d057974694b4f6b142da9ed4a199859d9d4d5c63fe96"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:48021783bdf96e3d6de03a6e39a1171ed5bd7e8bb93fc84cc649d11490f87cea"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:4957669ef390f0e6719db3613ab3a7631e68424604a7b448f079bee145da6e09"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:46fb8c61d794b78ec7134a715a3e564aafc8f6b5e338417cb19fe9f57a5a9bf2"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f779d3ad205f108d14e99bb3859aa7dd8e9c68874617c72354d7ecaec2a054ac"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f25c229a6ba38a35ae6e25ca1264621cc25d4d38dca2942a7fce0b67a4efe918"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2efb1bd13885392adfda4614c33d3b68dee4921fd0ac1d3988f8cbb7d589e72a"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1f30b48dd7fa1474554b0b0f3fdfdd4c13b5c737a3c6284d3cdc424ec0ffff3a"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:246de67b99b6851627d945db38147d1b209a899311b1305dd84916f2b88526c6"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9bd9b3b31adcb054116447ea22caa61a285d92e94d710aa5ec97992ff5eb7cf3"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:8c2f5e83493748286002f9369f3e6607c565a6a90425a3a1fef5ae32a36d749d"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:3170c9399da12c9dc66366e9d14da8bf7147e1e9d9ea566067bbce7bb74bd9c2"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:7a4826ad2bd6b07ca615c74ab91f32f6c96d08f6fcc3902ceeedaec8cdc3bcd6"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:3b1613dd5aee995ec6d4c69f00378bbd07614702a315a2cf6c1d21461fe17c23"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9e608aafdb55eb9f255034709e20d5a83b6d60c054df0802fa9c9883d0a937aa"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-win32.whl", hash = "sha256:f2a1d0fd4242bd8643ce6f98927cf9c04540af6efa92323e9d3124f57727bfc1"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:681eb3d7e02e3c3655d1b16059fbfb605ac464c834a0c629048a30fad2b27489"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c57921cda3a80d0f2b8aec7e25c8aa14479ea92b5b51b6876d975d925a2ea346"}, + 
{file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41b25eaa7d15909cf3ac4c96088c1f266a9a93ec44f87f1d13d4a0e86c81b982"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f058f6963fd82eb143c692cecdc89e075fa0828db2e5b291070485390b2f1c9c"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a7647ebdfb9682b7bb97e2a5e7cb6ae735b1c25008a70b906aecca294ee96cf4"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eef9df1eefada2c09a5e7a40991b9fc6ac6ef20b1372abd48d2794a316dc0449"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e03b8895a6990c9ab2cdcd0f2fe44088ca1c65ae592b8f795c3294af00a461c3"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:ee4006268ed33370957f55bf2e6f4d263eaf4dc3cfc473d1d90baff6ed36ce4a"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:c4983bf937209c57240cff65906b18bb35e64ae872da6a0db937d7b4af845dd7"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:3bb7fda7260735efe66d5107fb7e6af6a7c04c7fce9b2514e04b7a74b06bf5dd"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:72814c01533f51d68702802d74f77ea026b5ec52793c791e2da806a3844a46c3"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:70c610f6cbe4b9fce272c407dd9d07e33e6bf7b4aa1b7ffb6f6ded8e634e3592"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-win32.whl", hash = "sha256:a401b4598e5d3f4a9a811f3daf42ee2291790c7f9d74b18d75d6e21dda98a1a1"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-win_amd64.whl", hash = "sha256:c0b21078a4b56965e2b12f247467b234734491897e99c1d51cee628da9786959"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:95eb302ff792e12aba9a8b8f8474ab229a83c103d74a750ec0bd1c1eea32e669"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1a100c6d595a7f316f1b6f01d20815d916e75ff98c27a01ae817439ea7726329"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6339d047dab2780cc6220f46306628e04d9750f02f983ddb37439ca47ced7149"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4b749b9cc6ee664a3300bb3a273c1ca8068c46be705b6c31cf5d276f8628a94"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a38856a971c602f98472050165cea2cdc97709240373041b69030be15047691f"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f87f746ee241d30d6ed93969de31e5ffd09a2961a051e60ae6bddde9ec3583aa"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89f1b185a01fe560bc8ae5f619e924407efca2191b56ce749ec84982fc59a32a"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e1c8a2f4c69e08e89632defbfabec2feb8a8d99edc9f89ce33c4b9e36ab63037"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:2f4ac36d8e2b4cc1aa71df3dd84ff8efbe3bfb97ac41242fbcfc053c67434f46"}, + {file = 
"charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a386ebe437176aab38c041de1260cd3ea459c6ce5263594399880bbc398225b2"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:ccd16eb18a849fd8dcb23e23380e2f0a354e8daa0c984b8a732d9cfaba3a776d"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:e6a5bf2cba5ae1bb80b154ed68a3cfa2fa00fde979a7f50d6598d3e17d9ac20c"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:45de3f87179c1823e6d9e32156fb14c1927fcc9aba21433f088fdfb555b77c10"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-win32.whl", hash = "sha256:1000fba1057b92a65daec275aec30586c3de2401ccdcd41f8a5c1e2c87078706"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:8b2c760cfc7042b27ebdb4a43a4453bd829a5742503599144d54a032c5dc7e9e"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:855eafa5d5a2034b4621c74925d89c5efef61418570e5ef9b37717d9c796419c"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:203f0c8871d5a7987be20c72442488a0b8cfd0f43b7973771640fc593f56321f"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e857a2232ba53ae940d3456f7533ce6ca98b81917d47adc3c7fd55dad8fab858"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5e86d77b090dbddbe78867a0275cb4df08ea195e660f1f7f13435a4649e954e5"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4fb39a81950ec280984b3a44f5bd12819953dc5fa3a7e6fa7a80db5ee853952"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2dee8e57f052ef5353cf608e0b4c871aee320dd1b87d351c28764fc0ca55f9f4"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8700f06d0ce6f128de3ccdbc1acaea1ee264d2caa9ca05daaf492fde7c2a7200"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1920d4ff15ce893210c1f0c0e9d19bfbecb7983c76b33f046c13a8ffbd570252"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:c1c76a1743432b4b60ab3358c937a3fe1341c828ae6194108a94c69028247f22"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f7560358a6811e52e9c4d142d497f1a6e10103d3a6881f18d04dbce3729c0e2c"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:c8063cf17b19661471ecbdb3df1c84f24ad2e389e326ccaf89e3fb2484d8dd7e"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:cd6dbe0238f7743d0efe563ab46294f54f9bc8f4b9bcf57c3c666cc5bc9d1299"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:1249cbbf3d3b04902ff081ffbb33ce3377fa6e4c7356f759f3cd076cc138d020"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-win32.whl", hash = "sha256:6c409c0deba34f147f77efaa67b8e4bb83d2f11c8806405f76397ae5b8c0d1c9"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:7095f6fbfaa55defb6b733cfeb14efaae7a29f0b59d8cf213be4e7ca0b857b80"}, + {file = "charset_normalizer-3.2.0-py3-none-any.whl", hash = "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6"}, ] [[package]] name = "click" -version = "8.1.3" +version = 
"8.1.5" description = "Composable command line interface toolkit" category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"}, - {file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"}, + {file = "click-8.1.5-py3-none-any.whl", hash = "sha256:e576aa487d679441d7d30abb87e1b43d24fc53bffb8758443b1a9e1cee504548"}, + {file = "click-8.1.5.tar.gz", hash = "sha256:4be4b1af8d665c6d942909916d31a213a106800c47d0eeba73d34da3cbc11367"}, ] [package.dependencies] @@ -343,14 +355,14 @@ files = [ [[package]] name = "exceptiongroup" -version = "1.1.1" +version = "1.1.2" description = "Backport of PEP 654 (exception groups)" category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "exceptiongroup-1.1.1-py3-none-any.whl", hash = "sha256:232c37c63e4f682982c8b6459f33a8981039e5fb8756b2074364e5055c498c9e"}, - {file = "exceptiongroup-1.1.1.tar.gz", hash = "sha256:d484c3090ba2889ae2928419117447a14daf3c1231d5e30d0aae34f354f01785"}, + {file = "exceptiongroup-1.1.2-py3-none-any.whl", hash = "sha256:e346e69d186172ca7cf029c8c1d16235aa0e04035e5750b4b95039e65204328f"}, + {file = "exceptiongroup-1.1.2.tar.gz", hash = "sha256:12c3e887d6485d16943a309616de20ae5582633e0a2eda17f4e10fd61c1e8af5"}, ] [package.extras] @@ -375,86 +387,73 @@ pyflakes = ">=2.3.0,<2.4.0" [[package]] name = "frozenlist" -version = "1.3.3" +version = "1.4.0" description = "A list-like structure which implements collections.abc.MutableSequence" category = "main" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "frozenlist-1.3.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ff8bf625fe85e119553b5383ba0fb6aa3d0ec2ae980295aaefa552374926b3f4"}, - {file = "frozenlist-1.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:dfbac4c2dfcc082fcf8d942d1e49b6aa0766c19d3358bd86e2000bf0fa4a9cf0"}, - {file = "frozenlist-1.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b1c63e8d377d039ac769cd0926558bb7068a1f7abb0f003e3717ee003ad85530"}, - {file = "frozenlist-1.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7fdfc24dcfce5b48109867c13b4cb15e4660e7bd7661741a391f821f23dfdca7"}, - {file = "frozenlist-1.3.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2c926450857408e42f0bbc295e84395722ce74bae69a3b2aa2a65fe22cb14b99"}, - {file = "frozenlist-1.3.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1841e200fdafc3d51f974d9d377c079a0694a8f06de2e67b48150328d66d5483"}, - {file = "frozenlist-1.3.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f470c92737afa7d4c3aacc001e335062d582053d4dbe73cda126f2d7031068dd"}, - {file = "frozenlist-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:783263a4eaad7c49983fe4b2e7b53fa9770c136c270d2d4bbb6d2192bf4d9caf"}, - {file = "frozenlist-1.3.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:924620eef691990dfb56dc4709f280f40baee568c794b5c1885800c3ecc69816"}, - {file = "frozenlist-1.3.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ae4dc05c465a08a866b7a1baf360747078b362e6a6dbeb0c57f234db0ef88ae0"}, - {file = "frozenlist-1.3.3-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:bed331fe18f58d844d39ceb398b77d6ac0b010d571cba8267c2e7165806b00ce"}, - {file = 
"frozenlist-1.3.3-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:02c9ac843e3390826a265e331105efeab489ffaf4dd86384595ee8ce6d35ae7f"}, - {file = "frozenlist-1.3.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9545a33965d0d377b0bc823dcabf26980e77f1b6a7caa368a365a9497fb09420"}, - {file = "frozenlist-1.3.3-cp310-cp310-win32.whl", hash = "sha256:d5cd3ab21acbdb414bb6c31958d7b06b85eeb40f66463c264a9b343a4e238642"}, - {file = "frozenlist-1.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:b756072364347cb6aa5b60f9bc18e94b2f79632de3b0190253ad770c5df17db1"}, - {file = "frozenlist-1.3.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:b4395e2f8d83fbe0c627b2b696acce67868793d7d9750e90e39592b3626691b7"}, - {file = "frozenlist-1.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:14143ae966a6229350021384870458e4777d1eae4c28d1a7aa47f24d030e6678"}, - {file = "frozenlist-1.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5d8860749e813a6f65bad8285a0520607c9500caa23fea6ee407e63debcdbef6"}, - {file = "frozenlist-1.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23d16d9f477bb55b6154654e0e74557040575d9d19fe78a161bd33d7d76808e8"}, - {file = "frozenlist-1.3.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:eb82dbba47a8318e75f679690190c10a5e1f447fbf9df41cbc4c3afd726d88cb"}, - {file = "frozenlist-1.3.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9309869032abb23d196cb4e4db574232abe8b8be1339026f489eeb34a4acfd91"}, - {file = "frozenlist-1.3.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a97b4fe50b5890d36300820abd305694cb865ddb7885049587a5678215782a6b"}, - {file = "frozenlist-1.3.3-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c188512b43542b1e91cadc3c6c915a82a5eb95929134faf7fd109f14f9892ce4"}, - {file = "frozenlist-1.3.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:303e04d422e9b911a09ad499b0368dc551e8c3cd15293c99160c7f1f07b59a48"}, - {file = "frozenlist-1.3.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:0771aed7f596c7d73444c847a1c16288937ef988dc04fb9f7be4b2aa91db609d"}, - {file = "frozenlist-1.3.3-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:66080ec69883597e4d026f2f71a231a1ee9887835902dbe6b6467d5a89216cf6"}, - {file = "frozenlist-1.3.3-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:41fe21dc74ad3a779c3d73a2786bdf622ea81234bdd4faf90b8b03cad0c2c0b4"}, - {file = "frozenlist-1.3.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f20380df709d91525e4bee04746ba612a4df0972c1b8f8e1e8af997e678c7b81"}, - {file = "frozenlist-1.3.3-cp311-cp311-win32.whl", hash = "sha256:f30f1928162e189091cf4d9da2eac617bfe78ef907a761614ff577ef4edfb3c8"}, - {file = "frozenlist-1.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:a6394d7dadd3cfe3f4b3b186e54d5d8504d44f2d58dcc89d693698e8b7132b32"}, - {file = "frozenlist-1.3.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8df3de3a9ab8325f94f646609a66cbeeede263910c5c0de0101079ad541af332"}, - {file = "frozenlist-1.3.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0693c609e9742c66ba4870bcee1ad5ff35462d5ffec18710b4ac89337ff16e27"}, - {file = "frozenlist-1.3.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cd4210baef299717db0a600d7a3cac81d46ef0e007f88c9335db79f8979c0d3d"}, - {file = "frozenlist-1.3.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:394c9c242113bfb4b9aa36e2b80a05ffa163a30691c7b5a29eba82e937895d5e"}, - {file = "frozenlist-1.3.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6327eb8e419f7d9c38f333cde41b9ae348bec26d840927332f17e887a8dcb70d"}, - {file = "frozenlist-1.3.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e24900aa13212e75e5b366cb9065e78bbf3893d4baab6052d1aca10d46d944c"}, - {file = "frozenlist-1.3.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:3843f84a6c465a36559161e6c59dce2f2ac10943040c2fd021cfb70d58c4ad56"}, - {file = "frozenlist-1.3.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:84610c1502b2461255b4c9b7d5e9c48052601a8957cd0aea6ec7a7a1e1fb9420"}, - {file = "frozenlist-1.3.3-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:c21b9aa40e08e4f63a2f92ff3748e6b6c84d717d033c7b3438dd3123ee18f70e"}, - {file = "frozenlist-1.3.3-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:efce6ae830831ab6a22b9b4091d411698145cb9b8fc869e1397ccf4b4b6455cb"}, - {file = "frozenlist-1.3.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:40de71985e9042ca00b7953c4f41eabc3dc514a2d1ff534027f091bc74416401"}, - {file = "frozenlist-1.3.3-cp37-cp37m-win32.whl", hash = "sha256:180c00c66bde6146a860cbb81b54ee0df350d2daf13ca85b275123bbf85de18a"}, - {file = "frozenlist-1.3.3-cp37-cp37m-win_amd64.whl", hash = "sha256:9bbbcedd75acdfecf2159663b87f1bb5cfc80e7cd99f7ddd9d66eb98b14a8411"}, - {file = "frozenlist-1.3.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:034a5c08d36649591be1cbb10e09da9f531034acfe29275fc5454a3b101ce41a"}, - {file = "frozenlist-1.3.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ba64dc2b3b7b158c6660d49cdb1d872d1d0bf4e42043ad8d5006099479a194e5"}, - {file = "frozenlist-1.3.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:47df36a9fe24054b950bbc2db630d508cca3aa27ed0566c0baf661225e52c18e"}, - {file = "frozenlist-1.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:008a054b75d77c995ea26629ab3a0c0d7281341f2fa7e1e85fa6153ae29ae99c"}, - {file = "frozenlist-1.3.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:841ea19b43d438a80b4de62ac6ab21cfe6827bb8a9dc62b896acc88eaf9cecba"}, - {file = "frozenlist-1.3.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e235688f42b36be2b6b06fc37ac2126a73b75fb8d6bc66dd632aa35286238703"}, - {file = "frozenlist-1.3.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca713d4af15bae6e5d79b15c10c8522859a9a89d3b361a50b817c98c2fb402a2"}, - {file = "frozenlist-1.3.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ac5995f2b408017b0be26d4a1d7c61bce106ff3d9e3324374d66b5964325448"}, - {file = "frozenlist-1.3.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:a4ae8135b11652b08a8baf07631d3ebfe65a4c87909dbef5fa0cdde440444ee4"}, - {file = "frozenlist-1.3.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4ea42116ceb6bb16dbb7d526e242cb6747b08b7710d9782aa3d6732bd8d27649"}, - {file = "frozenlist-1.3.3-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:810860bb4bdce7557bc0febb84bbd88198b9dbc2022d8eebe5b3590b2ad6c842"}, - {file = "frozenlist-1.3.3-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:ee78feb9d293c323b59a6f2dd441b63339a30edf35abcb51187d2fc26e696d13"}, - {file = "frozenlist-1.3.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = 
"sha256:0af2e7c87d35b38732e810befb9d797a99279cbb85374d42ea61c1e9d23094b3"}, - {file = "frozenlist-1.3.3-cp38-cp38-win32.whl", hash = "sha256:899c5e1928eec13fd6f6d8dc51be23f0d09c5281e40d9cf4273d188d9feeaf9b"}, - {file = "frozenlist-1.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:7f44e24fa70f6fbc74aeec3e971f60a14dde85da364aa87f15d1be94ae75aeef"}, - {file = "frozenlist-1.3.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:2b07ae0c1edaa0a36339ec6cce700f51b14a3fc6545fdd32930d2c83917332cf"}, - {file = "frozenlist-1.3.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ebb86518203e12e96af765ee89034a1dbb0c3c65052d1b0c19bbbd6af8a145e1"}, - {file = "frozenlist-1.3.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5cf820485f1b4c91e0417ea0afd41ce5cf5965011b3c22c400f6d144296ccbc0"}, - {file = "frozenlist-1.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c11e43016b9024240212d2a65043b70ed8dfd3b52678a1271972702d990ac6d"}, - {file = "frozenlist-1.3.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8fa3c6e3305aa1146b59a09b32b2e04074945ffcfb2f0931836d103a2c38f936"}, - {file = "frozenlist-1.3.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:352bd4c8c72d508778cf05ab491f6ef36149f4d0cb3c56b1b4302852255d05d5"}, - {file = "frozenlist-1.3.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:65a5e4d3aa679610ac6e3569e865425b23b372277f89b5ef06cf2cdaf1ebf22b"}, - {file = "frozenlist-1.3.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1e2c1185858d7e10ff045c496bbf90ae752c28b365fef2c09cf0fa309291669"}, - {file = "frozenlist-1.3.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f163d2fd041c630fed01bc48d28c3ed4a3b003c00acd396900e11ee5316b56bb"}, - {file = "frozenlist-1.3.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:05cdb16d09a0832eedf770cb7bd1fe57d8cf4eaf5aced29c4e41e3f20b30a784"}, - {file = "frozenlist-1.3.3-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:8bae29d60768bfa8fb92244b74502b18fae55a80eac13c88eb0b496d4268fd2d"}, - {file = "frozenlist-1.3.3-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:eedab4c310c0299961ac285591acd53dc6723a1ebd90a57207c71f6e0c2153ab"}, - {file = "frozenlist-1.3.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:3bbdf44855ed8f0fbcd102ef05ec3012d6a4fd7c7562403f76ce6a52aeffb2b1"}, - {file = "frozenlist-1.3.3-cp39-cp39-win32.whl", hash = "sha256:efa568b885bca461f7c7b9e032655c0c143d305bf01c30caf6db2854a4532b38"}, - {file = "frozenlist-1.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:cfe33efc9cb900a4c46f91a5ceba26d6df370ffddd9ca386eb1d4f0ad97b9ea9"}, - {file = "frozenlist-1.3.3.tar.gz", hash = "sha256:58bcc55721e8a90b88332d6cd441261ebb22342e238296bb330968952fbb3a6a"}, + {file = "frozenlist-1.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:764226ceef3125e53ea2cb275000e309c0aa5464d43bd72abd661e27fffc26ab"}, + {file = "frozenlist-1.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d6484756b12f40003c6128bfcc3fa9f0d49a687e171186c2d85ec82e3758c559"}, + {file = "frozenlist-1.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9ac08e601308e41eb533f232dbf6b7e4cea762f9f84f6357136eed926c15d12c"}, + {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d081f13b095d74b67d550de04df1c756831f3b83dc9881c38985834387487f1b"}, + {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash 
= "sha256:71932b597f9895f011f47f17d6428252fc728ba2ae6024e13c3398a087c2cdea"}, + {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:981b9ab5a0a3178ff413bca62526bb784249421c24ad7381e39d67981be2c326"}, + {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e41f3de4df3e80de75845d3e743b3f1c4c8613c3997a912dbf0229fc61a8b963"}, + {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6918d49b1f90821e93069682c06ffde41829c346c66b721e65a5c62b4bab0300"}, + {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0e5c8764c7829343d919cc2dfc587a8db01c4f70a4ebbc49abde5d4b158b007b"}, + {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:8d0edd6b1c7fb94922bf569c9b092ee187a83f03fb1a63076e7774b60f9481a8"}, + {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:e29cda763f752553fa14c68fb2195150bfab22b352572cb36c43c47bedba70eb"}, + {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:0c7c1b47859ee2cac3846fde1c1dc0f15da6cec5a0e5c72d101e0f83dcb67ff9"}, + {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:901289d524fdd571be1c7be054f48b1f88ce8dddcbdf1ec698b27d4b8b9e5d62"}, + {file = "frozenlist-1.4.0-cp310-cp310-win32.whl", hash = "sha256:1a0848b52815006ea6596c395f87449f693dc419061cc21e970f139d466dc0a0"}, + {file = "frozenlist-1.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:b206646d176a007466358aa21d85cd8600a415c67c9bd15403336c331a10d956"}, + {file = "frozenlist-1.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:de343e75f40e972bae1ef6090267f8260c1446a1695e77096db6cfa25e759a95"}, + {file = "frozenlist-1.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ad2a9eb6d9839ae241701d0918f54c51365a51407fd80f6b8289e2dfca977cc3"}, + {file = "frozenlist-1.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bd7bd3b3830247580de99c99ea2a01416dfc3c34471ca1298bccabf86d0ff4dc"}, + {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bdf1847068c362f16b353163391210269e4f0569a3c166bc6a9f74ccbfc7e839"}, + {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:38461d02d66de17455072c9ba981d35f1d2a73024bee7790ac2f9e361ef1cd0c"}, + {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5a32087d720c608f42caed0ef36d2b3ea61a9d09ee59a5142d6070da9041b8f"}, + {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dd65632acaf0d47608190a71bfe46b209719bf2beb59507db08ccdbe712f969b"}, + {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:261b9f5d17cac914531331ff1b1d452125bf5daa05faf73b71d935485b0c510b"}, + {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b89ac9768b82205936771f8d2eb3ce88503b1556324c9f903e7156669f521472"}, + {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:008eb8b31b3ea6896da16c38c1b136cb9fec9e249e77f6211d479db79a4eaf01"}, + {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e74b0506fa5aa5598ac6a975a12aa8928cbb58e1f5ac8360792ef15de1aa848f"}, + {file = 
"frozenlist-1.4.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:490132667476f6781b4c9458298b0c1cddf237488abd228b0b3650e5ecba7467"}, + {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:76d4711f6f6d08551a7e9ef28c722f4a50dd0fc204c56b4bcd95c6cc05ce6fbb"}, + {file = "frozenlist-1.4.0-cp311-cp311-win32.whl", hash = "sha256:a02eb8ab2b8f200179b5f62b59757685ae9987996ae549ccf30f983f40602431"}, + {file = "frozenlist-1.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:515e1abc578dd3b275d6a5114030b1330ba044ffba03f94091842852f806f1c1"}, + {file = "frozenlist-1.4.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:f0ed05f5079c708fe74bf9027e95125334b6978bf07fd5ab923e9e55e5fbb9d3"}, + {file = "frozenlist-1.4.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ca265542ca427bf97aed183c1676e2a9c66942e822b14dc6e5f42e038f92a503"}, + {file = "frozenlist-1.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:491e014f5c43656da08958808588cc6c016847b4360e327a62cb308c791bd2d9"}, + {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:17ae5cd0f333f94f2e03aaf140bb762c64783935cc764ff9c82dff626089bebf"}, + {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1e78fb68cf9c1a6aa4a9a12e960a5c9dfbdb89b3695197aa7064705662515de2"}, + {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5655a942f5f5d2c9ed93d72148226d75369b4f6952680211972a33e59b1dfdc"}, + {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c11b0746f5d946fecf750428a95f3e9ebe792c1ee3b1e96eeba145dc631a9672"}, + {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e66d2a64d44d50d2543405fb183a21f76b3b5fd16f130f5c99187c3fb4e64919"}, + {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:88f7bc0fcca81f985f78dd0fa68d2c75abf8272b1f5c323ea4a01a4d7a614efc"}, + {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:5833593c25ac59ede40ed4de6d67eb42928cca97f26feea219f21d0ed0959b79"}, + {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:fec520865f42e5c7f050c2a79038897b1c7d1595e907a9e08e3353293ffc948e"}, + {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:b826d97e4276750beca7c8f0f1a4938892697a6bcd8ec8217b3312dad6982781"}, + {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:ceb6ec0a10c65540421e20ebd29083c50e6d1143278746a4ef6bcf6153171eb8"}, + {file = "frozenlist-1.4.0-cp38-cp38-win32.whl", hash = "sha256:2b8bcf994563466db019fab287ff390fffbfdb4f905fc77bc1c1d604b1c689cc"}, + {file = "frozenlist-1.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:a6c8097e01886188e5be3e6b14e94ab365f384736aa1fca6a0b9e35bd4a30bc7"}, + {file = "frozenlist-1.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:6c38721585f285203e4b4132a352eb3daa19121a035f3182e08e437cface44bf"}, + {file = "frozenlist-1.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a0c6da9aee33ff0b1a451e867da0c1f47408112b3391dd43133838339e410963"}, + {file = "frozenlist-1.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:93ea75c050c5bb3d98016b4ba2497851eadf0ac154d88a67d7a6816206f6fa7f"}, + {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f61e2dc5ad442c52b4887f1fdc112f97caeff4d9e6ebe78879364ac59f1663e1"}, + 
{file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aa384489fefeb62321b238e64c07ef48398fe80f9e1e6afeff22e140e0850eef"}, + {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:10ff5faaa22786315ef57097a279b833ecab1a0bfb07d604c9cbb1c4cdc2ed87"}, + {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:007df07a6e3eb3e33e9a1fe6a9db7af152bbd8a185f9aaa6ece10a3529e3e1c6"}, + {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f4f399d28478d1f604c2ff9119907af9726aed73680e5ed1ca634d377abb087"}, + {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:c5374b80521d3d3f2ec5572e05adc94601985cc526fb276d0c8574a6d749f1b3"}, + {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:ce31ae3e19f3c902de379cf1323d90c649425b86de7bbdf82871b8a2a0615f3d"}, + {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:7211ef110a9194b6042449431e08c4d80c0481e5891e58d429df5899690511c2"}, + {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:556de4430ce324c836789fa4560ca62d1591d2538b8ceb0b4f68fb7b2384a27a"}, + {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7645a8e814a3ee34a89c4a372011dcd817964ce8cb273c8ed6119d706e9613e3"}, + {file = "frozenlist-1.4.0-cp39-cp39-win32.whl", hash = "sha256:19488c57c12d4e8095a922f328df3f179c820c212940a498623ed39160bc3c2f"}, + {file = "frozenlist-1.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:6221d84d463fb110bdd7619b69cb43878a11d51cbb9394ae3105d082d5199167"}, + {file = "frozenlist-1.4.0.tar.gz", hash = "sha256:09163bdf0b2907454042edb19f887c6d33806adc71fbd54afc14908bfdc22251"}, ] [[package]] @@ -472,6 +471,82 @@ files = [ [package.extras] rewrite = ["tokenize-rt (>=3)"] +[[package]] +name = "google-auth" +version = "2.22.0" +description = "Google Authentication Library" +category = "dev" +optional = false +python-versions = ">=3.6" +files = [ + {file = "google-auth-2.22.0.tar.gz", hash = "sha256:164cba9af4e6e4e40c3a4f90a1a6c12ee56f14c0b4868d1ca91b32826ab334ce"}, + {file = "google_auth-2.22.0-py2.py3-none-any.whl", hash = "sha256:d61d1b40897407b574da67da1a833bdc10d5a11642566e506565d1b1a46ba873"}, +] + +[package.dependencies] +cachetools = ">=2.0.0,<6.0" +pyasn1-modules = ">=0.2.1" +rsa = ">=3.1.4,<5" +six = ">=1.9.0" +urllib3 = "<2.0" + +[package.extras] +aiohttp = ["aiohttp (>=3.6.2,<4.0.0.dev0)", "requests (>=2.20.0,<3.0.0.dev0)"] +enterprise-cert = ["cryptography (==36.0.2)", "pyopenssl (==22.0.0)"] +pyopenssl = ["cryptography (>=38.0.3)", "pyopenssl (>=20.0.0)"] +reauth = ["pyu2f (>=0.1.5)"] +requests = ["requests (>=2.20.0,<3.0.0.dev0)"] + +[[package]] +name = "google-auth-oauthlib" +version = "1.0.0" +description = "Google Authentication Library" +category = "dev" +optional = false +python-versions = ">=3.6" +files = [ + {file = "google-auth-oauthlib-1.0.0.tar.gz", hash = "sha256:e375064964820b47221a7e1b7ee1fd77051b6323c3f9e3e19785f78ab67ecfc5"}, + {file = "google_auth_oauthlib-1.0.0-py2.py3-none-any.whl", hash = "sha256:95880ca704928c300f48194d1770cf5b1462835b6e49db61445a520f793fd5fb"}, +] + +[package.dependencies] +google-auth = ">=2.15.0" +requests-oauthlib = ">=0.7.0" + +[package.extras] +tool = ["click (>=6.0.0)"] + +[[package]] +name = "gspread" +version = "5.10.0" +description = "Google Spreadsheets Python API" 
+category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "gspread-5.10.0-py3-none-any.whl", hash = "sha256:f58b15d950ef5c45c8607edb3687188d5a543f2b66242f9c26fbb0d2cf36169d"}, + {file = "gspread-5.10.0.tar.gz", hash = "sha256:2b6bba6dc111580170346a9bcd1893e0e8c52f67a9e537caec7b7a1e27c14435"}, +] + +[package.dependencies] +google-auth = ">=1.12.0" +google-auth-oauthlib = ">=0.4.1" + +[[package]] +name = "httplib2" +version = "0.22.0" +description = "A comprehensive HTTP client library." +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "httplib2-0.22.0-py3-none-any.whl", hash = "sha256:14ae0a53c1ba8f3d37e9e27cf37eabb0fb9980f435ba405d546948b009dd64dc"}, + {file = "httplib2-0.22.0.tar.gz", hash = "sha256:d7a10bc5ef5ab08322488bde8c726eeee5c8618723fdb399597ec58f3d82df81"}, +] + +[package.dependencies] +pyparsing = {version = ">=2.4.2,<3.0.0 || >3.0.0,<3.0.1 || >3.0.1,<3.0.2 || >3.0.2,<3.0.3 || >3.0.3,<4", markers = "python_version > \"3.0\""} + [[package]] name = "idna" version = "3.4" @@ -682,6 +757,77 @@ doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx- extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"] test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] +[[package]] +name = "numpy" +version = "1.25.1" +description = "Fundamental package for array computing in Python" +category = "dev" +optional = false +python-versions = ">=3.9" +files = [ + {file = "numpy-1.25.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:77d339465dff3eb33c701430bcb9c325b60354698340229e1dff97745e6b3efa"}, + {file = "numpy-1.25.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d736b75c3f2cb96843a5c7f8d8ccc414768d34b0a75f466c05f3a739b406f10b"}, + {file = "numpy-1.25.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a90725800caeaa160732d6b31f3f843ebd45d6b5f3eec9e8cc287e30f2805bf"}, + {file = "numpy-1.25.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c6c9261d21e617c6dc5eacba35cb68ec36bb72adcff0dee63f8fbc899362588"}, + {file = "numpy-1.25.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0def91f8af6ec4bb94c370e38c575855bf1d0be8a8fbfba42ef9c073faf2cf19"}, + {file = "numpy-1.25.1-cp310-cp310-win32.whl", hash = "sha256:fd67b306320dcadea700a8f79b9e671e607f8696e98ec255915c0c6d6b818503"}, + {file = "numpy-1.25.1-cp310-cp310-win_amd64.whl", hash = "sha256:c1516db588987450b85595586605742879e50dcce923e8973f79529651545b57"}, + {file = "numpy-1.25.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6b82655dd8efeea69dbf85d00fca40013d7f503212bc5259056244961268b66e"}, + {file = "numpy-1.25.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e8f6049c4878cb16960fbbfb22105e49d13d752d4d8371b55110941fb3b17800"}, + {file = "numpy-1.25.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41a56b70e8139884eccb2f733c2f7378af06c82304959e174f8e7370af112e09"}, + {file = "numpy-1.25.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5154b1a25ec796b1aee12ac1b22f414f94752c5f94832f14d8d6c9ac40bcca6"}, + {file = "numpy-1.25.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:38eb6548bb91c421261b4805dc44def9ca1a6eef6444ce35ad1669c0f1a3fc5d"}, + {file = "numpy-1.25.1-cp311-cp311-win32.whl", hash = "sha256:791f409064d0a69dd20579345d852c59822c6aa087f23b07b1b4e28ff5880fcb"}, + {file = "numpy-1.25.1-cp311-cp311-win_amd64.whl", hash = 
"sha256:c40571fe966393b212689aa17e32ed905924120737194b5d5c1b20b9ed0fb171"}, + {file = "numpy-1.25.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3d7abcdd85aea3e6cdddb59af2350c7ab1ed764397f8eec97a038ad244d2d105"}, + {file = "numpy-1.25.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1a180429394f81c7933634ae49b37b472d343cccb5bb0c4a575ac8bbc433722f"}, + {file = "numpy-1.25.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d412c1697c3853c6fc3cb9751b4915859c7afe6a277c2bf00acf287d56c4e625"}, + {file = "numpy-1.25.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:20e1266411120a4f16fad8efa8e0454d21d00b8c7cee5b5ccad7565d95eb42dd"}, + {file = "numpy-1.25.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:f76aebc3358ade9eacf9bc2bb8ae589863a4f911611694103af05346637df1b7"}, + {file = "numpy-1.25.1-cp39-cp39-win32.whl", hash = "sha256:247d3ffdd7775bdf191f848be8d49100495114c82c2bd134e8d5d075fb386a1c"}, + {file = "numpy-1.25.1-cp39-cp39-win_amd64.whl", hash = "sha256:1d5d3c68e443c90b38fdf8ef40e60e2538a27548b39b12b73132456847f4b631"}, + {file = "numpy-1.25.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:35a9527c977b924042170a0887de727cd84ff179e478481404c5dc66b4170009"}, + {file = "numpy-1.25.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d3fe3dd0506a28493d82dc3cf254be8cd0d26f4008a417385cbf1ae95b54004"}, + {file = "numpy-1.25.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:012097b5b0d00a11070e8f2e261128c44157a8689f7dedcf35576e525893f4fe"}, + {file = "numpy-1.25.1.tar.gz", hash = "sha256:9a3a9f3a61480cc086117b426a8bd86869c213fc4072e606f01c4e4b66eb92bf"}, +] + +[[package]] +name = "oauth2client" +version = "4.1.3" +description = "OAuth 2.0 client library" +category = "dev" +optional = false +python-versions = "*" +files = [ + {file = "oauth2client-4.1.3-py2.py3-none-any.whl", hash = "sha256:b8a81cc5d60e2d364f0b1b98f958dbd472887acaf1a5b05e21c28c31a2d6d3ac"}, + {file = "oauth2client-4.1.3.tar.gz", hash = "sha256:d486741e451287f69568a4d26d70d9acd73a2bbfa275746c535b4209891cccc6"}, +] + +[package.dependencies] +httplib2 = ">=0.9.1" +pyasn1 = ">=0.1.7" +pyasn1-modules = ">=0.0.5" +rsa = ">=3.1.4" +six = ">=1.6.1" + +[[package]] +name = "oauthlib" +version = "3.2.2" +description = "A generic, spec-compliant, thorough implementation of the OAuth request-signing logic" +category = "dev" +optional = false +python-versions = ">=3.6" +files = [ + {file = "oauthlib-3.2.2-py3-none-any.whl", hash = "sha256:8139f29aac13e25d502680e9e19963e83f16838d48a0d71c287fe40e7067fbca"}, + {file = "oauthlib-3.2.2.tar.gz", hash = "sha256:9859c40929662bec5d64f34d01c99e093149682a3f38915dc0655d5a633dd918"}, +] + +[package.extras] +rsa = ["cryptography (>=3.0.0)"] +signals = ["blinker (>=1.4.0)"] +signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"] + [[package]] name = "openai" version = "0.27.8" @@ -717,6 +863,73 @@ files = [ {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, ] +[[package]] +name = "pandas" +version = "2.0.3" +description = "Powerful data structures for data analysis, time series, and statistics" +category = "dev" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pandas-2.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e4c7c9f27a4185304c7caf96dc7d91bc60bc162221152de697c98eb0b2648dd8"}, + {file = "pandas-2.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:f167beed68918d62bffb6ec64f2e1d8a7d297a038f86d4aed056b9493fca407f"}, + {file = "pandas-2.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce0c6f76a0f1ba361551f3e6dceaff06bde7514a374aa43e33b588ec10420183"}, + {file = "pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba619e410a21d8c387a1ea6e8a0e49bb42216474436245718d7f2e88a2f8d7c0"}, + {file = "pandas-2.0.3-cp310-cp310-win32.whl", hash = "sha256:3ef285093b4fe5058eefd756100a367f27029913760773c8bf1d2d8bebe5d210"}, + {file = "pandas-2.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:9ee1a69328d5c36c98d8e74db06f4ad518a1840e8ccb94a4ba86920986bb617e"}, + {file = "pandas-2.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b084b91d8d66ab19f5bb3256cbd5ea661848338301940e17f4492b2ce0801fe8"}, + {file = "pandas-2.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:37673e3bdf1551b95bf5d4ce372b37770f9529743d2498032439371fc7b7eb26"}, + {file = "pandas-2.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9cb1e14fdb546396b7e1b923ffaeeac24e4cedd14266c3497216dd4448e4f2d"}, + {file = "pandas-2.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9cd88488cceb7635aebb84809d087468eb33551097d600c6dad13602029c2df"}, + {file = "pandas-2.0.3-cp311-cp311-win32.whl", hash = "sha256:694888a81198786f0e164ee3a581df7d505024fbb1f15202fc7db88a71d84ebd"}, + {file = "pandas-2.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:6a21ab5c89dcbd57f78d0ae16630b090eec626360085a4148693def5452d8a6b"}, + {file = "pandas-2.0.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9e4da0d45e7f34c069fe4d522359df7d23badf83abc1d1cef398895822d11061"}, + {file = "pandas-2.0.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32fca2ee1b0d93dd71d979726b12b61faa06aeb93cf77468776287f41ff8fdc5"}, + {file = "pandas-2.0.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:258d3624b3ae734490e4d63c430256e716f488c4fcb7c8e9bde2d3aa46c29089"}, + {file = "pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9eae3dc34fa1aa7772dd3fc60270d13ced7346fcbcfee017d3132ec625e23bb0"}, + {file = "pandas-2.0.3-cp38-cp38-win32.whl", hash = "sha256:f3421a7afb1a43f7e38e82e844e2bca9a6d793d66c1a7f9f0ff39a795bbc5e02"}, + {file = "pandas-2.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:69d7f3884c95da3a31ef82b7618af5710dba95bb885ffab339aad925c3e8ce78"}, + {file = "pandas-2.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5247fb1ba347c1261cbbf0fcfba4a3121fbb4029d95d9ef4dc45406620b25c8b"}, + {file = "pandas-2.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:81af086f4543c9d8bb128328b5d32e9986e0c84d3ee673a2ac6fb57fd14f755e"}, + {file = "pandas-2.0.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1994c789bf12a7c5098277fb43836ce090f1073858c10f9220998ac74f37c69b"}, + {file = "pandas-2.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ec591c48e29226bcbb316e0c1e9423622bc7a4eaf1ef7c3c9fa1a3981f89641"}, + {file = "pandas-2.0.3-cp39-cp39-win32.whl", hash = "sha256:04dbdbaf2e4d46ca8da896e1805bc04eb85caa9a82e259e8eed00254d5e0c682"}, + {file = "pandas-2.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:1168574b036cd8b93abc746171c9b4f1b83467438a5e45909fed645cf8692dbc"}, + {file = "pandas-2.0.3.tar.gz", hash = "sha256:c02f372a88e0d17f36d3093a644c73cfc1788e876a7c4bcb4020a77512e2043c"}, +] + +[package.dependencies] +numpy = [ + {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, + 
{version = ">=1.23.2", markers = "python_version >= \"3.11\""}, +] +python-dateutil = ">=2.8.2" +pytz = ">=2020.1" +tzdata = ">=2022.1" + +[package.extras] +all = ["PyQt5 (>=5.15.1)", "SQLAlchemy (>=1.4.16)", "beautifulsoup4 (>=4.9.3)", "bottleneck (>=1.3.2)", "brotlipy (>=0.7.0)", "fastparquet (>=0.6.3)", "fsspec (>=2021.07.0)", "gcsfs (>=2021.07.0)", "html5lib (>=1.1)", "hypothesis (>=6.34.2)", "jinja2 (>=3.0.0)", "lxml (>=4.6.3)", "matplotlib (>=3.6.1)", "numba (>=0.53.1)", "numexpr (>=2.7.3)", "odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pandas-gbq (>=0.15.0)", "psycopg2 (>=2.8.6)", "pyarrow (>=7.0.0)", "pymysql (>=1.0.2)", "pyreadstat (>=1.1.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)", "python-snappy (>=0.6.0)", "pyxlsb (>=1.0.8)", "qtpy (>=2.2.0)", "s3fs (>=2021.08.0)", "scipy (>=1.7.1)", "tables (>=3.6.1)", "tabulate (>=0.8.9)", "xarray (>=0.21.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)", "zstandard (>=0.15.2)"] +aws = ["s3fs (>=2021.08.0)"] +clipboard = ["PyQt5 (>=5.15.1)", "qtpy (>=2.2.0)"] +compression = ["brotlipy (>=0.7.0)", "python-snappy (>=0.6.0)", "zstandard (>=0.15.2)"] +computation = ["scipy (>=1.7.1)", "xarray (>=0.21.0)"] +excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pyxlsb (>=1.0.8)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)"] +feather = ["pyarrow (>=7.0.0)"] +fss = ["fsspec (>=2021.07.0)"] +gcp = ["gcsfs (>=2021.07.0)", "pandas-gbq (>=0.15.0)"] +hdf5 = ["tables (>=3.6.1)"] +html = ["beautifulsoup4 (>=4.9.3)", "html5lib (>=1.1)", "lxml (>=4.6.3)"] +mysql = ["SQLAlchemy (>=1.4.16)", "pymysql (>=1.0.2)"] +output-formatting = ["jinja2 (>=3.0.0)", "tabulate (>=0.8.9)"] +parquet = ["pyarrow (>=7.0.0)"] +performance = ["bottleneck (>=1.3.2)", "numba (>=0.53.1)", "numexpr (>=2.7.1)"] +plot = ["matplotlib (>=3.6.1)"] +postgresql = ["SQLAlchemy (>=1.4.16)", "psycopg2 (>=2.8.6)"] +spss = ["pyreadstat (>=1.1.2)"] +sql-other = ["SQLAlchemy (>=1.4.16)"] +test = ["hypothesis (>=6.34.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)"] +xml = ["lxml (>=4.6.3)"] + [[package]] name = "pathspec" version = "0.11.1" @@ -746,14 +959,14 @@ ptyprocess = ">=0.5" [[package]] name = "platformdirs" -version = "3.8.0" +version = "3.9.1" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." 
category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "platformdirs-3.8.0-py3-none-any.whl", hash = "sha256:ca9ed98ce73076ba72e092b23d3c93ea6c4e186b3f1c3dad6edd98ff6ffcca2e"}, - {file = "platformdirs-3.8.0.tar.gz", hash = "sha256:b0cabcb11063d21a0b261d557acb0a9d2126350e63b70cdf7db6347baea456dc"}, + {file = "platformdirs-3.9.1-py3-none-any.whl", hash = "sha256:ad8291ae0ae5072f66c16945166cb11c63394c7a3ad1b1bc9828ca3162da8c2f"}, + {file = "platformdirs-3.9.1.tar.gz", hash = "sha256:1b42b450ad933e981d56e59f1b97495428c9bd60698baab9f3eb3d00d5822421"}, ] [package.extras] @@ -788,6 +1001,33 @@ files = [ {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"}, ] +[[package]] +name = "pyasn1" +version = "0.5.0" +description = "Pure-Python implementation of ASN.1 types and DER/BER/CER codecs (X.208)" +category = "dev" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +files = [ + {file = "pyasn1-0.5.0-py2.py3-none-any.whl", hash = "sha256:87a2121042a1ac9358cabcaf1d07680ff97ee6404333bacca15f76aa8ad01a57"}, + {file = "pyasn1-0.5.0.tar.gz", hash = "sha256:97b7290ca68e62a832558ec3976f15cbf911bf5d7c7039d8b861c2a0ece69fde"}, +] + +[[package]] +name = "pyasn1-modules" +version = "0.3.0" +description = "A collection of ASN.1-based protocols modules" +category = "dev" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +files = [ + {file = "pyasn1_modules-0.3.0-py2.py3-none-any.whl", hash = "sha256:d3ccd6ed470d9ffbc716be08bd90efbd44d0734bc9303818f7336070984a162d"}, + {file = "pyasn1_modules-0.3.0.tar.gz", hash = "sha256:5bd01446b736eb9d31512a30d46c1ac3395d676c6f3cafa4c03eb54b9925631c"}, +] + +[package.dependencies] +pyasn1 = ">=0.4.6,<0.6.0" + [[package]] name = "pycodestyle" version = "2.7.0" @@ -802,48 +1042,48 @@ files = [ [[package]] name = "pydantic" -version = "1.10.10" +version = "1.10.11" description = "Data validation and settings management using python type hints" category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "pydantic-1.10.10-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:adad1ee4ab9888f12dac2529276704e719efcf472e38df7813f5284db699b4ec"}, - {file = "pydantic-1.10.10-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7a7db03339893feef2092ff7b1afc9497beed15ebd4af84c3042a74abce02d48"}, - {file = "pydantic-1.10.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67b3714b97ff84b2689654851c2426389bcabfac9080617bcf4306c69db606f6"}, - {file = "pydantic-1.10.10-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:edfdf0a5abc5c9bf2052ebaec20e67abd52e92d257e4f2d30e02c354ed3e6030"}, - {file = "pydantic-1.10.10-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:20a3b30fd255eeeb63caa9483502ba96b7795ce5bf895c6a179b3d909d9f53a6"}, - {file = "pydantic-1.10.10-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:db4c7f7e60ca6f7d6c1785070f3e5771fcb9b2d88546e334d2f2c3934d949028"}, - {file = "pydantic-1.10.10-cp310-cp310-win_amd64.whl", hash = "sha256:a2d5be50ac4a0976817144c7d653e34df2f9436d15555189f5b6f61161d64183"}, - {file = "pydantic-1.10.10-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:566a04ba755e8f701b074ffb134ddb4d429f75d5dced3fbd829a527aafe74c71"}, - {file = "pydantic-1.10.10-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f79db3652ed743309f116ba863dae0c974a41b688242482638b892246b7db21d"}, - {file = 
"pydantic-1.10.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c62376890b819bebe3c717a9ac841a532988372b7e600e76f75c9f7c128219d5"}, - {file = "pydantic-1.10.10-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4870f13a4fafd5bc3e93cff3169222534fad867918b188e83ee0496452978437"}, - {file = "pydantic-1.10.10-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:990027e77cda6072a566e433b6962ca3b96b4f3ae8bd54748e9d62a58284d9d7"}, - {file = "pydantic-1.10.10-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:8c40964596809eb616d94f9c7944511f620a1103d63d5510440ed2908fc410af"}, - {file = "pydantic-1.10.10-cp311-cp311-win_amd64.whl", hash = "sha256:ea9eebc2ebcba3717e77cdeee3f6203ffc0e78db5f7482c68b1293e8cc156e5e"}, - {file = "pydantic-1.10.10-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:762aa598f79b4cac2f275d13336b2dd8662febee2a9c450a49a2ab3bec4b385f"}, - {file = "pydantic-1.10.10-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6dab5219659f95e357d98d70577b361383057fb4414cfdb587014a5f5c595f7b"}, - {file = "pydantic-1.10.10-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3d4ee957a727ccb5a36f1b0a6dbd9fad5dedd2a41eada99a8df55c12896e18d"}, - {file = "pydantic-1.10.10-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:b69f9138dec566962ec65623c9d57bee44412d2fc71065a5f3ebb3820bdeee96"}, - {file = "pydantic-1.10.10-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:7aa75d1bd9cc275cf9782f50f60cddaf74cbaae19b6ada2a28e737edac420312"}, - {file = "pydantic-1.10.10-cp37-cp37m-win_amd64.whl", hash = "sha256:9f62a727f5c590c78c2d12fda302d1895141b767c6488fe623098f8792255fe5"}, - {file = "pydantic-1.10.10-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:aac218feb4af73db8417ca7518fb3bade4534fcca6e3fb00f84966811dd94450"}, - {file = "pydantic-1.10.10-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:88546dc10a40b5b52cae87d64666787aeb2878f9a9b37825aedc2f362e7ae1da"}, - {file = "pydantic-1.10.10-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c41bbaae89e32fc582448e71974de738c055aef5ab474fb25692981a08df808a"}, - {file = "pydantic-1.10.10-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b71bd504d1573b0b722ae536e8ffb796bedeef978979d076bf206e77dcc55a5"}, - {file = "pydantic-1.10.10-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:e088e3865a2270ecbc369924cd7d9fbc565667d9158e7f304e4097ebb9cf98dd"}, - {file = "pydantic-1.10.10-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:3403a090db45d4027d2344859d86eb797484dfda0706cf87af79ace6a35274ef"}, - {file = "pydantic-1.10.10-cp38-cp38-win_amd64.whl", hash = "sha256:e0014e29637125f4997c174dd6167407162d7af0da73414a9340461ea8573252"}, - {file = "pydantic-1.10.10-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9965e49c6905840e526e5429b09e4c154355b6ecc0a2f05492eda2928190311d"}, - {file = "pydantic-1.10.10-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:748d10ab6089c5d196e1c8be9de48274f71457b01e59736f7a09c9dc34f51887"}, - {file = "pydantic-1.10.10-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86936c383f7c38fd26d35107eb669c85d8f46dfceae873264d9bab46fe1c7dde"}, - {file = "pydantic-1.10.10-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7a26841be620309a9697f5b1ffc47dce74909e350c5315ccdac7a853484d468a"}, - {file = 
"pydantic-1.10.10-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:409b810f387610cc7405ab2fa6f62bdf7ea485311845a242ebc0bd0496e7e5ac"}, - {file = "pydantic-1.10.10-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ce937a2a2c020bcad1c9fde02892392a1123de6dda906ddba62bfe8f3e5989a2"}, - {file = "pydantic-1.10.10-cp39-cp39-win_amd64.whl", hash = "sha256:37ebddef68370e6f26243acc94de56d291e01227a67b2ace26ea3543cf53dd5f"}, - {file = "pydantic-1.10.10-py3-none-any.whl", hash = "sha256:a5939ec826f7faec434e2d406ff5e4eaf1716eb1f247d68cd3d0b3612f7b4c8a"}, - {file = "pydantic-1.10.10.tar.gz", hash = "sha256:3b8d5bd97886f9eb59260594207c9f57dce14a6f869c6ceea90188715d29921a"}, + {file = "pydantic-1.10.11-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ff44c5e89315b15ff1f7fdaf9853770b810936d6b01a7bcecaa227d2f8fe444f"}, + {file = "pydantic-1.10.11-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a6c098d4ab5e2d5b3984d3cb2527e2d6099d3de85630c8934efcfdc348a9760e"}, + {file = "pydantic-1.10.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:16928fdc9cb273c6af00d9d5045434c39afba5f42325fb990add2c241402d151"}, + {file = "pydantic-1.10.11-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0588788a9a85f3e5e9ebca14211a496409cb3deca5b6971ff37c556d581854e7"}, + {file = "pydantic-1.10.11-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e9baf78b31da2dc3d3f346ef18e58ec5f12f5aaa17ac517e2ffd026a92a87588"}, + {file = "pydantic-1.10.11-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:373c0840f5c2b5b1ccadd9286782852b901055998136287828731868027a724f"}, + {file = "pydantic-1.10.11-cp310-cp310-win_amd64.whl", hash = "sha256:c3339a46bbe6013ef7bdd2844679bfe500347ac5742cd4019a88312aa58a9847"}, + {file = "pydantic-1.10.11-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:08a6c32e1c3809fbc49debb96bf833164f3438b3696abf0fbeceb417d123e6eb"}, + {file = "pydantic-1.10.11-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a451ccab49971af043ec4e0d207cbc8cbe53dbf148ef9f19599024076fe9c25b"}, + {file = "pydantic-1.10.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5b02d24f7b2b365fed586ed73582c20f353a4c50e4be9ba2c57ab96f8091ddae"}, + {file = "pydantic-1.10.11-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3f34739a89260dfa420aa3cbd069fbcc794b25bbe5c0a214f8fb29e363484b66"}, + {file = "pydantic-1.10.11-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:e297897eb4bebde985f72a46a7552a7556a3dd11e7f76acda0c1093e3dbcf216"}, + {file = "pydantic-1.10.11-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d185819a7a059550ecb85d5134e7d40f2565f3dd94cfd870132c5f91a89cf58c"}, + {file = "pydantic-1.10.11-cp311-cp311-win_amd64.whl", hash = "sha256:4400015f15c9b464c9db2d5d951b6a780102cfa5870f2c036d37c23b56f7fc1b"}, + {file = "pydantic-1.10.11-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2417de68290434461a266271fc57274a138510dca19982336639484c73a07af6"}, + {file = "pydantic-1.10.11-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:331c031ba1554b974c98679bd0780d89670d6fd6f53f5d70b10bdc9addee1713"}, + {file = "pydantic-1.10.11-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8268a735a14c308923e8958363e3a3404f6834bb98c11f5ab43251a4e410170c"}, + {file = "pydantic-1.10.11-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:44e51ba599c3ef227e168424e220cd3e544288c57829520dc90ea9cb190c3248"}, + 
{file = "pydantic-1.10.11-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d7781f1d13b19700b7949c5a639c764a077cbbdd4322ed505b449d3ca8edcb36"}, + {file = "pydantic-1.10.11-cp37-cp37m-win_amd64.whl", hash = "sha256:7522a7666157aa22b812ce14c827574ddccc94f361237ca6ea8bb0d5c38f1629"}, + {file = "pydantic-1.10.11-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bc64eab9b19cd794a380179ac0e6752335e9555d214cfcb755820333c0784cb3"}, + {file = "pydantic-1.10.11-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:8dc77064471780262b6a68fe67e013298d130414d5aaf9b562c33987dbd2cf4f"}, + {file = "pydantic-1.10.11-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe429898f2c9dd209bd0632a606bddc06f8bce081bbd03d1c775a45886e2c1cb"}, + {file = "pydantic-1.10.11-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:192c608ad002a748e4a0bed2ddbcd98f9b56df50a7c24d9a931a8c5dd053bd3d"}, + {file = "pydantic-1.10.11-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ef55392ec4bb5721f4ded1096241e4b7151ba6d50a50a80a2526c854f42e6a2f"}, + {file = "pydantic-1.10.11-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:41e0bb6efe86281623abbeeb0be64eab740c865388ee934cd3e6a358784aca6e"}, + {file = "pydantic-1.10.11-cp38-cp38-win_amd64.whl", hash = "sha256:265a60da42f9f27e0b1014eab8acd3e53bd0bad5c5b4884e98a55f8f596b2c19"}, + {file = "pydantic-1.10.11-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:469adf96c8e2c2bbfa655fc7735a2a82f4c543d9fee97bd113a7fb509bf5e622"}, + {file = "pydantic-1.10.11-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e6cbfbd010b14c8a905a7b10f9fe090068d1744d46f9e0c021db28daeb8b6de1"}, + {file = "pydantic-1.10.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:abade85268cc92dff86d6effcd917893130f0ff516f3d637f50dadc22ae93999"}, + {file = "pydantic-1.10.11-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e9738b0f2e6c70f44ee0de53f2089d6002b10c33264abee07bdb5c7f03038303"}, + {file = "pydantic-1.10.11-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:787cf23e5a0cde753f2eabac1b2e73ae3844eb873fd1f5bdbff3048d8dbb7604"}, + {file = "pydantic-1.10.11-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:174899023337b9fc685ac8adaa7b047050616136ccd30e9070627c1aaab53a13"}, + {file = "pydantic-1.10.11-cp39-cp39-win_amd64.whl", hash = "sha256:1954f8778489a04b245a1e7b8b22a9d3ea8ef49337285693cf6959e4b757535e"}, + {file = "pydantic-1.10.11-py3-none-any.whl", hash = "sha256:008c5e266c8aada206d0627a011504e14268a62091450210eda7c07fabe6963e"}, + {file = "pydantic-1.10.11.tar.gz", hash = "sha256:f66d479cf7eb331372c470614be6511eae96f1f120344c25f3f9bb59fb1b5528"}, ] [package.dependencies] @@ -865,6 +1105,21 @@ files = [ {file = "pyflakes-2.3.1.tar.gz", hash = "sha256:f5bc8ecabc05bb9d291eb5203d6810b49040f6ff446a756326104746cc00c1db"}, ] +[[package]] +name = "pyparsing" +version = "3.1.0" +description = "pyparsing module - Classes and methods to define and execute parsing grammars" +category = "dev" +optional = false +python-versions = ">=3.6.8" +files = [ + {file = "pyparsing-3.1.0-py3-none-any.whl", hash = "sha256:d554a96d1a7d3ddaf7183104485bc19fd80543ad6ac5bdb6426719d766fb06c1"}, + {file = "pyparsing-3.1.0.tar.gz", hash = "sha256:edb662d6fe322d6e990b1594b5feaeadf806803359e3d4d42f11e295e588f0ea"}, +] + +[package.extras] +diagrams = ["jinja2", "railroad-diagrams"] + [[package]] name = "pytest" version = "7.4.0" @@ -906,6 +1161,21 @@ future-fstrings = "*" networkx = "*" pytest = 
">=3" +[[package]] +name = "python-dateutil" +version = "2.8.2" +description = "Extensions to the standard Python datetime module" +category = "dev" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +files = [ + {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, + {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, +] + +[package.dependencies] +six = ">=1.5" + [[package]] name = "python-dotenv" version = "0.21.1" @@ -921,6 +1191,18 @@ files = [ [package.extras] cli = ["click (>=5.0)"] +[[package]] +name = "pytz" +version = "2023.3" +description = "World timezone definitions, modern and historical" +category = "dev" +optional = false +python-versions = "*" +files = [ + {file = "pytz-2023.3-py2.py3-none-any.whl", hash = "sha256:a151b3abb88eda1d4e34a9814df37de2a80e301e68ba0fd856fb9b46bfbbbffb"}, + {file = "pytz-2023.3.tar.gz", hash = "sha256:1d8ce29db189191fb55338ee6d0387d82ab59f3d00eac103412d64e0ebd0c588"}, +] + [[package]] name = "requests" version = "2.31.0" @@ -943,6 +1225,52 @@ urllib3 = ">=1.21.1,<3" socks = ["PySocks (>=1.5.6,!=1.5.7)"] use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] +[[package]] +name = "requests-oauthlib" +version = "1.3.1" +description = "OAuthlib authentication support for Requests." +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "requests-oauthlib-1.3.1.tar.gz", hash = "sha256:75beac4a47881eeb94d5ea5d6ad31ef88856affe2332b9aafb52c6452ccf0d7a"}, + {file = "requests_oauthlib-1.3.1-py2.py3-none-any.whl", hash = "sha256:2577c501a2fb8d05a304c09d090d6e47c306fef15809d102b327cf8364bddab5"}, +] + +[package.dependencies] +oauthlib = ">=3.0.0" +requests = ">=2.0.0" + +[package.extras] +rsa = ["oauthlib[signedtoken] (>=3.0.0)"] + +[[package]] +name = "rsa" +version = "4.9" +description = "Pure-Python RSA implementation" +category = "dev" +optional = false +python-versions = ">=3.6,<4" +files = [ + {file = "rsa-4.9-py3-none-any.whl", hash = "sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7"}, + {file = "rsa-4.9.tar.gz", hash = "sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21"}, +] + +[package.dependencies] +pyasn1 = ">=0.1.3" + +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] + [[package]] name = "toml" version = "0.10.2" @@ -1027,23 +1355,34 @@ files = [ {file = "typing_extensions-4.7.1.tar.gz", hash = "sha256:b75ddc264f0ba5615db7ba217daeb99701ad295353c45f9e95963337ceeeffb2"}, ] +[[package]] +name = "tzdata" +version = "2023.3" +description = "Provider of IANA time zone data" +category = "dev" +optional = false +python-versions = ">=2" +files = [ + {file = "tzdata-2023.3-py2.py3-none-any.whl", hash = "sha256:7e65763eef3120314099b6939b5546db7adce1e7d6f2e179e3df563c70511eda"}, + {file = "tzdata-2023.3.tar.gz", hash = "sha256:11ef1e08e54acb0d4f95bdb1be05da659673de4acbd21bf9c69e94cc5e907a3a"}, +] + [[package]] name = "urllib3" -version = "2.0.3" +version = "1.26.16" description = "HTTP library with 
thread-safe connection pooling, file post, and more." category = "main" optional = false -python-versions = ">=3.7" +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" files = [ - {file = "urllib3-2.0.3-py3-none-any.whl", hash = "sha256:48e7fafa40319d358848e1bc6809b208340fafe2096f1725d05d67443d0483d1"}, - {file = "urllib3-2.0.3.tar.gz", hash = "sha256:bee28b5e56addb8226c96f7f13ac28cb4c301dd5ea8a6ca179c0b9835e032825"}, + {file = "urllib3-1.26.16-py2.py3-none-any.whl", hash = "sha256:8d36afa7616d8ab714608411b4a3b13e58f463aee519024578e062e141dce20f"}, + {file = "urllib3-1.26.16.tar.gz", hash = "sha256:8f135f6502756bde6b2a9b28989df5fbe87c9970cecaa69041edcce7f0589b14"}, ] [package.extras] -brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] -secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17.1.0)", "urllib3-secure-extra"] -socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] -zstd = ["zstandard (>=0.18.0)"] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"] +secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] +socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [[package]] name = "yarl" @@ -1136,4 +1475,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "09871e879785f0a7d5c31a61553cd2df08d88324a864b9c56b8e97d95893157f" +content-hash = "4dbf4bdf1965f80ad6ae3c27c6dab58e9ccf2e0fd154c8380e2df9e30455ffcd" diff --git a/pyproject.toml b/pyproject.toml index 48be9cf5d..cf0504d62 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,9 @@ mypy = "^0.910" isort = "^5.9.3" black = "22.3" autoflake = "^1.4" +pandas = "^2.0.3" +gspread = "^5.10.0" +oauth2client = "^4.1.3" [build-system] requires = ["poetry-core"] diff --git a/send_to_googledrive.py b/send_to_googledrive.py new file mode 100644 index 000000000..aa074ea6a --- /dev/null +++ b/send_to_googledrive.py @@ -0,0 +1,112 @@ +import base64 +import json +import os + +import gspread +import pandas as pd +from dotenv import load_dotenv +from oauth2client.service_account import ServiceAccountCredentials + +# Load environment variables from .env file +load_dotenv() + +# Get the base64 string from the environment variable +base64_creds = os.getenv("GDRIVE_BASE64") + +if base64_creds is None: + raise ValueError("The GDRIVE_BASE64 environment variable is not set") + +# Decode the base64 string into bytes +creds_bytes = base64.b64decode(base64_creds) + +# Convert the bytes into a string +creds_string = creds_bytes.decode("utf-8") + +# Parse the string into a JSON object +creds_info = json.loads(creds_string) + +# Define the base directory containing JSON files +base_dir = "reports" + +# Create a list to store each row of data +rows = [] + +# Loop over each directory in the base directory +for sub_dir in os.listdir(base_dir): + # Define the subdirectory path + sub_dir_path = os.path.join(base_dir, sub_dir) + + # Ensure the sub_dir_path is a directory + if os.path.isdir(sub_dir_path): + # Loop over each file in the subdirectory + for filename in os.listdir(sub_dir_path): + # Check if the file is a JSON file + if filename.endswith(".json"): + # Define the file path + file_path = os.path.join(sub_dir_path, filename) + + # Load the JSON data from the file + with open(file_path, "r") as f: + data = json.load(f) + + # Loop through each test + for test_name, test_info in data["tests"].items(): + # Create a dictionary to hold the information for this row + row = { + "Agent": 
sub_dir, + "Command": data.get("command", ""), + "Completion Time": data.get("completion_time", ""), + "Total Run Time": data.get("metrics", {}).get("run_time", ""), + "Highest Difficulty": data.get("metrics", {}).get( + "highest_difficulty", "" + ), + "Workspace": data.get("config", {}).get("workspace", ""), + "Test Name": test_name, + "Data Path": test_info.get("data_path", ""), + "Is Regression": test_info.get("is_regression", ""), + "Difficulty": test_info.get("metrics", {}).get( + "difficulty", "" + ), + "Success": test_info.get("metrics", {}).get("success", ""), + "Success %": test_info.get("metrics", {}).get("success_%", ""), + "Non mock success %": test_info.get("metrics", {}).get( + "non_mock_success_%", "" + ), + "Run Time": test_info.get("metrics", {}).get("run_time", ""), + } + + # Add this row to the list + rows.append(row) + +# Convert the list of rows into a DataFrame +df = pd.DataFrame(rows) + +# Define the scope +scope = [ + "https://spreadsheets.google.com/feeds", + "https://www.googleapis.com/auth/drive", +] + +# Add your service account credentials +creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_info, scope) + +# Authorize the gspread client +client = gspread.authorize(creds) + +# Get the instance of the Spreadsheet +sheet = client.open("benchmark") + +# Get the first sheet of the Spreadsheet +sheet_instance = sheet.get_worksheet(0) + +# Convert dataframe to list of lists for uploading to Google Sheets +values = df.values.tolist() + +# Prepend the header to the values list +values.insert(0, df.columns.tolist()) + +# Clear the existing values in the worksheet +sheet_instance.clear() + +# Update the worksheet with the new values +sheet_instance.append_rows(values) -- cgit v1.2.3 From 953060335d028c113f8180dcbf28362fda8b166d Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Tue, 18 Jul 2023 16:20:12 +0000 Subject: smol-developer-20230718162012 --- reports/smol-developer/file4_07-18-16-19.json | 266 ++++++++++++++++++++++++++ 1 file changed, 266 insertions(+) create mode 100644 reports/smol-developer/file4_07-18-16-19.json diff --git a/reports/smol-developer/file4_07-18-16-19.json b/reports/smol-developer/file4_07-18-16-19.json new file mode 100644 index 000000000..43fd780f8 --- /dev/null +++ b/reports/smol-developer/file4_07-18-16-19.json @@ -0,0 +1,266 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-18-16:20", + "metrics": { + "run_time": "31.71 seconds", + "highest_difficulty": "advanced: 5" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "5.187 seconds" + }, + "reached_cutoff": false + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": true, + "success_%": 100.0, + "run_time": "7.488 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "11.614 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "6.074 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "1.164 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": 
"agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. 
", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "generated" + } +} \ No newline at end of file -- cgit v1.2.3 From 52e7f92e54f2276b40e83a8aeb6a1da3a9ab2aab Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Tue, 18 Jul 2023 16:20:39 +0000 Subject: Auto-GPT-20230718162039 --- reports/Auto-GPT/file4_07-18-16-20.json | 268 ++++++++++++++++++++++ 1 file changed, 268 insertions(+) create mode 100644 reports/Auto-GPT/file4_07-18-16-20.json diff --git a/reports/Auto-GPT/file4_07-18-16-20.json b/reports/Auto-GPT/file4_07-18-16-20.json new file mode 100644 index 000000000..f7d6d7cb6 --- /dev/null +++ b/reports/Auto-GPT/file4_07-18-16-20.json @@ -0,0 +1,268 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-18-16:20", + "metrics": { + "run_time": "21.6 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "21.346 seconds" + }, + "reached_cutoff": false + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestThreeSum::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0], venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0], venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0], venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0], venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).",
+      "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.",
+      "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.",
+      "metrics": {
+        "difficulty": "intermediate",
+        "success": false,
+        "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.001 seconds"
+      },
+      "reached_cutoff": false
+    },
+    "TestRememberMultiplePhrasesWithNoise": {
+      "data_path": "agbenchmark/challenges/memory/m4_phrases",
+      "is_regression": false,
+      "task": "Follow the instructions in the instructions_1.txt file",
+      "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
+      "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
+      "metrics": {
+        "difficulty": "advanced",
+        "success": false,
+        "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.001 seconds"
+      },
+      "reached_cutoff": false
+    }
+  },
+  "config": {
+    "workspace": "autogpt/workspace/auto_gpt_workspace"
+  }
+}
\ No newline at end of file
-- cgit v1.2.3


From ef1ded34602d372b3134fa8026cb3cb0dd1f49e5 Mon Sep 17 00:00:00 2001
From: Auto-GPT-Bot
Date: Tue, 18 Jul 2023 16:21:54 +0000
Subject: gpt-engineer-20230718162154

---
 reports/gpt-engineer/file4_07-18-16-19.json | 267 ++++++++++++++++++++++++++++
 1 file changed, 267 insertions(+)
 create mode 100644 reports/gpt-engineer/file4_07-18-16-19.json

diff --git a/reports/gpt-engineer/file4_07-18-16-19.json b/reports/gpt-engineer/file4_07-18-16-19.json
new file mode 100644
index 000000000..2fdba0555
--- /dev/null
+++ b/reports/gpt-engineer/file4_07-18-16-19.json
@@ -0,0 +1,267 @@
+{
+  "command": "agbenchmark start",
+  "completion_time": "2023-07-18-16:21",
+  "metrics": {
+    "run_time": "124.12 seconds",
+    "highest_difficulty": "interface: 1"
+  },
+  "tests": {
+    "TestWriteFile": {
+      "data_path": "agbenchmark/challenges/interface/write_file",
+      "is_regression": false,
+      "task": "Print the capital of America to a .txt file",
+      "answer": "Washington",
+      "description": "Tests writing to a file",
+      "metrics": {
+        "difficulty": "interface",
+        "success": true,
+        "success_%": 100.0,
+        "run_time": "68.605 seconds"
+      },
+      "reached_cutoff": true
+    },
+    "TestBasicCodeGeneration": {
+      "data_path": "agbenchmark/challenges/code/d3_two_sum",
+      "is_regression": false,
+      "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. 
You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "[Errno 20] Not a directory: '/home/runner/work/Auto-GPT-Benchmarks/Auto-GPT-Benchmarks/agent/gpt-engineer/projects/my-new-project/workspace'", + "success_%": 0.0, + "run_time": "53.647 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in []", + "success_%": 0.0, + "run_time": "0.772 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "[Errno 20] Not a directory: '/home/runner/work/Auto-GPT-Benchmarks/Auto-GPT-Benchmarks/agent/gpt-engineer/projects/my-new-project/workspace/file_to_check.txt'", + "success_%": 0.0, + "run_time": "0.811 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestThreeSum::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.",
+      "metrics": {
+        "difficulty": "advanced",
+        "success": false,
+        "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.001 seconds"
+      },
+      "reached_cutoff": false
+    }
+  },
+  "config": {
+    "workspace": "projects/my-new-project/workspace"
+  }
+}
\ No newline at end of file
-- cgit v1.2.3


From fb1c34ec9f2e0078e53c856e4046ee3658c5f4c6 Mon Sep 17 00:00:00 2001
From: Auto-GPT-Bot
Date: Tue, 18 Jul 2023 16:27:05 +0000
Subject: mini-agi-20230718162705

---
 reports/mini-agi/file2_07-18-16-20.json | 260 ++++++++++++++++++++++++++++
 1 file changed, 260 insertions(+)
 create mode 100644 reports/mini-agi/file2_07-18-16-20.json

diff --git a/reports/mini-agi/file2_07-18-16-20.json b/reports/mini-agi/file2_07-18-16-20.json
new file mode 100644
index 000000000..e8cde442d
--- /dev/null
+++ b/reports/mini-agi/file2_07-18-16-20.json
@@ -0,0 +1,260 @@
+{
+  "command": "agbenchmark start",
+  "completion_time": "2023-07-18-16:26",
+  "metrics": {
+    "run_time": "400.95 seconds",
+    "highest_difficulty": "advanced: 5"
+  },
+  "tests": {
+    "TestWriteFile": {
+      "data_path": "agbenchmark/challenges/interface/write_file",
+      "is_regression": true,
+      "task": "Print the capital of America to a .txt file",
+      "answer": "Washington",
+      "description": "Tests writing to a file",
+      "metrics": {
+        "difficulty": "interface",
+        "success": true,
+        "success_%": 100.0,
+        "run_time": "15.813 seconds"
+      },
+      "reached_cutoff": false
+    },
+    "TestBasicCodeGeneration": {
+      "data_path": "agbenchmark/challenges/code/d3_two_sum",
+      "is_regression": false,
+      "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": true, + "success_%": 100.0, + "run_time": "32.591 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": true, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "22.01 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": true, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "25.419 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "59.541 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": true, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": true, + "success_%": 100.0, + "run_time": "25.535 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": true, + "success_%": 20.0, + "run_time": "49.13 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: '/home/runner/miniagi/result.txt'", + "success_%": 66.67, + "run_time": "57.587 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": true, + "success_%": 66.67, + "run_time": "22.668 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "11.021 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": true, + "success_%": 100.0, + "run_time": "53.932 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 25.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "25.51 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 33.33, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.",
+      "metrics": {
+        "difficulty": "advanced",
+        "success": false,
+        "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.001 seconds"
+      },
+      "reached_cutoff": false
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+  }
+}
\ No newline at end of file
-- cgit v1.2.3


From 0c94bb5f2510661762e9406b8b5bce094d6249c0 Mon Sep 17 00:00:00 2001
From: Reinier van der Leer
Date: Tue, 18 Jul 2023 22:34:52 +0200
Subject: Fix configuring TTS engine (#5005)

---
 autogpt/config/config.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/autogpt/config/config.py b/autogpt/config/config.py
index cb3f26d3e..b6773511d 100644
--- a/autogpt/config/config.py
+++ b/autogpt/config/config.py
@@ -277,16 +277,16 @@ class ConfigBuilder(Configurable[Config]):
         config_dict["elevenlabs_voice_id"] = os.getenv(
             "ELEVENLABS_VOICE_ID", os.getenv("ELEVENLABS_VOICE_1_ID")
         )
-        elevenlabs_api_key = os.getenv("ELEVENLABS_API_KEY")
-        if os.getenv("USE_MAC_OS_TTS"):
-            default_tts_provider = "macos"
-        elif elevenlabs_api_key:
-            default_tts_provider = "elevenlabs"
-        elif os.getenv("USE_BRIAN_TTS"):
-            default_tts_provider = "streamelements"
-        else:
-            default_tts_provider = "gtts"
-        config_dict["text_to_speech_provider"] = default_tts_provider
+        if not config_dict["text_to_speech_provider"]:
+            if os.getenv("USE_MAC_OS_TTS"):
+                default_tts_provider = "macos"
+            elif config_dict["elevenlabs_api_key"]:
+                default_tts_provider = "elevenlabs"
+            elif os.getenv("USE_BRIAN_TTS"):
+                default_tts_provider = "streamelements"
+            else:
+                default_tts_provider = "gtts"
+            config_dict["text_to_speech_provider"] = default_tts_provider
 
         config_dict["plugins_allowlist"] = _safe_split(os.getenv("ALLOWLISTED_PLUGINS"))
         config_dict["plugins_denylist"] = _safe_split(os.getenv("DENYLISTED_PLUGINS"))
-- cgit v1.2.3


From 5fe95adc069c89d5d2376a6835ae13ed1e743465 Mon Sep 17 00:00:00 2001
From: Auto-GPT-Bot
Date: Wed, 19 Jul 2023 08:18:54 +0000
Subject: smol-developer-20230719081854

---
 reports/smol-developer/file5_07-19-08-18.json | 266 ++++++++++++++++++++++++++
 1 file changed, 266 insertions(+)
 create mode 100644 reports/smol-developer/file5_07-19-08-18.json

diff --git a/reports/smol-developer/file5_07-19-08-18.json b/reports/smol-developer/file5_07-19-08-18.json
new file mode 100644
index 000000000..8e37c7651
--- /dev/null
+++ b/reports/smol-developer/file5_07-19-08-18.json
@@ -0,0 +1,266 @@
+{
+  "command": "agbenchmark start",
+  "completion_time": "2023-07-19-08:18",
+  "metrics": {
+    "run_time": "40.84 seconds",
+    "highest_difficulty": "advanced: 5"
+  },
+  "tests": {
+    "TestWriteFile": {
+      "data_path": "agbenchmark/challenges/interface/write_file",
+      "is_regression": false,
+      "task": "Print the capital of America to a .txt file",
+      "answer": "Washington",
+      "description": "Tests writing to a file",
+      "metrics": {
+        "difficulty": "interface",
+        "success": true,
+        "success_%": 100.0,
+        "run_time": "7.054 seconds"
+      },
+      "reached_cutoff": false
+    },
+    "TestBasicCodeGeneration": {
+      "data_path": "agbenchmark/challenges/code/d3_two_sum",
+      "is_regression": false,
+      "task": "Create a two_sum function in a file 
called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": true, + "success_%": 100.0, + "run_time": "9.593 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "14.527 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "7.886 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "1.513 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.",
+      "metrics": {
+        "difficulty": "advanced",
+        "success": false,
+        "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.001 seconds"
+      },
+      "reached_cutoff": false
+    }
+  },
+  "config": {
+    "workspace": "generated"
+  }
+}
\ No newline at end of file
-- cgit v1.2.3


From 34309a6c00e1d1c42b99944138cf78fff26f13de Mon Sep 17 00:00:00 2001
From: Auto-GPT-Bot
Date: Wed, 19 Jul 2023 08:19:09 +0000
Subject: gpt-engineer-20230719081909

---
 reports/gpt-engineer/file5_07-19-08-18.json | 268 ++++++++++++++++++++++++++++
 1 file changed, 268 insertions(+)
 create mode 100644 reports/gpt-engineer/file5_07-19-08-18.json

diff --git a/reports/gpt-engineer/file5_07-19-08-18.json b/reports/gpt-engineer/file5_07-19-08-18.json
new file mode 100644
index 000000000..68fe4d03e
--- /dev/null
+++ b/reports/gpt-engineer/file5_07-19-08-18.json
@@ -0,0 +1,268 @@
+{
+  "command": "agbenchmark start",
+  "completion_time": "2023-07-19-08:19",
+  "metrics": {
+    "run_time": "49.52 seconds",
+    "highest_difficulty": "No successful tests"
+  },
+  "tests": {
+    "TestWriteFile": {
+      "data_path": "agbenchmark/challenges/interface/write_file",
+      "is_regression": false,
+      "task": "Print the capital of America to a .txt file",
+      "answer": "Washington",
+      "description": "Tests writing to a file",
+      "metrics": {
+        "difficulty": "interface",
+        "success": false,
+        "fail_reason": "assert 1 in []",
+        "success_%": 0.0,
+        "run_time": "49.338 seconds"
+      },
+      "reached_cutoff": false
+    },
+    "TestBasicCodeGeneration": {
+      "data_path": "agbenchmark/challenges/code/d3_two_sum",
+      "is_regression": false,
+      "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestThreeSum::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0], agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0], agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0], agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0], agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "projects/my-new-project/workspace" + } +} \ No newline at end of file -- cgit v1.2.3 From f475631cddd22d0d5591c7dea40ba64c0fd3576d Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Wed, 19 Jul 2023 08:22:04 +0000 Subject: Auto-GPT-20230719082204 --- reports/Auto-GPT/file5_07-19-08-18.json | 267 ++++++++++++++++++++++ 1 file changed, 267 insertions(+) create mode 100644 reports/Auto-GPT/file5_07-19-08-18.json diff --git a/reports/Auto-GPT/file5_07-19-08-18.json b/reports/Auto-GPT/file5_07-19-08-18.json new file mode 100644 index 000000000..25761cc08 --- /dev/null +++ b/reports/Auto-GPT/file5_07-19-08-18.json @@ -0,0 +1,267 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-19-08:21", + "metrics": { + "run_time": "219.63 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "42.055 seconds" + }, + "reached_cutoff": false + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice.
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "90.246 seconds" + }, + "reached_cutoff": true + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "26.804 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "60.304 seconds" + }, + "reached_cutoff": true + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestThreeSum::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "autogpt/workspace/auto_gpt_workspace" + } +} \ No newline at end of file -- cgit v1.2.3 From dcdc0c97274586d599006ae02e969354eb9882b4 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Wed, 19 Jul 2023 13:37:29 -0700 Subject: Integrate Beebot (#169) --- .github/workflows/ci.yml | 31 +++++++++++++++++++------------ .gitmodules | 4 ++++ agent/beebot | 1 + 3 files changed, 24 insertions(+), 12 deletions(-) create mode 160000 agent/beebot diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e34b2e864..261800450 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -71,7 +71,6 @@ jobs: min-python-version: "3.10" name: "${{ matrix.agent-name }} (Cache: ${{ matrix.cache-enabled }})" runs-on: ubuntu-latest - timeout-minutes: 10 strategy: fail-fast: false matrix: @@ -80,6 +79,7 @@ jobs: - "smol-developer" - "Auto-GPT" - "mini-agi" + - "beebot" cache-enabled: [ true, false ] steps: @@ -115,6 +115,7 @@ jobs: - name: Run regression tests run: | cd agent/$AGENT_NAME + prefix="" if [ "$AGENT_NAME" == "gpt-engineer" ]; then make install source venv/bin/activate @@ -135,6 +136,10 @@ jobs: cp config_template.yaml config.yaml sed -i 's/OPENAI_API_KEY:.*/OPENAI_API_KEY: "'"${{ secrets.OPENAI_API_KEY }}"'"/' config.yaml docker-compose up -d --build + elif [ "$AGENT_NAME" == "beebot" ]; then + poetry install + poetry run playwright install + prefix="poetry run " else echo "Unknown agent name: $AGENT_NAME" exit 1 @@ -143,19 +148,19 @@ jobs: pip install ../../dist/*.whl if [ "${GITHUB_EVENT_NAME}" == "pull_request" ]; then - agbenchmark start --maintain --mock - 
agbenchmark start --improve --mock - agbenchmark start --mock - agbenchmark start --mock --category=retrieval - agbenchmark start --mock --category=interface - agbenchmark start --mock --category=code - agbenchmark start --mock --category=memory - agbenchmark start --mock --category=iterate + ${prefix}agbenchmark start --maintain --mock + ${prefix}agbenchmark start --improve --mock + ${prefix}agbenchmark start --mock + ${prefix}agbenchmark start --mock --category=retrieval + ${prefix}agbenchmark start --mock --category=interface + ${prefix}agbenchmark start --mock --category=code + ${prefix}agbenchmark start --mock --category=memory + ${prefix}agbenchmark start --mock --category=iterate else curl -s https://raw.githubusercontent.com/Helicone/helicone/main/mitmproxy.sh | bash -s start - agbenchmark start || echo "This command will always return a non zero exit code unless all the challenges are solved." + ${prefix}agbenchmark start || echo "This command will always return a non zero exit code unless all the challenges are solved." fi - + cd ../.. env: git fetch origin $current_branch git rebase origin/$current_branch git push origin HEAD - poetry run python send_to_googledrive.py + if [ "$current_branch" == "master" ]; then + poetry run python send_to_googledrive.py + fi env: GDRIVE_BASE64: ${{ secrets.GDRIVE_BASE64 }} diff --git a/.gitmodules b/.gitmodules index d2b71f9c4..9fefe0c06 100644 --- a/.gitmodules +++ b/.gitmodules @@ -18,3 +18,7 @@ path = agent/SuperAGI url = https://github.com/SilenNaihin/SuperAGI.git branch = benchmark-integration +[submodule "agent/beebot"] + path = agent/beebot + url = https://github.com/merwanehamadi/beebot.git + branch = benchmark-integration diff --git a/agent/beebot b/agent/beebot new file mode 160000 index 000000000..b9686b12d --- /dev/null +++ b/agent/beebot @@ -0,0 +1 @@ +Subproject commit b9686b12d317b26095d706665f0a43244d7afb7c -- cgit v1.2.3 From 2fcf5352b233618651bfdbad260e063cd662f14a Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Wed, 19 Jul 2023 20:40:10 +0000 Subject: smol-developer-20230719204010 --- reports/smol-developer/file6_07-19-20-39.json | 266 ++++++++++++++++++++++ 1 file changed, 266 insertions(+) create mode 100644 reports/smol-developer/file6_07-19-20-39.json diff --git a/reports/smol-developer/file6_07-19-20-39.json b/reports/smol-developer/file6_07-19-20-39.json new file mode 100644 index 000000000..1a56f98f9 --- /dev/null +++ b/reports/smol-developer/file6_07-19-20-39.json @@ -0,0 +1,266 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-19-20:40", + "metrics": { + "run_time": "38.68 seconds", + "highest_difficulty": "advanced: 5" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "5.156 seconds" + }, + "reached_cutoff": false + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice.
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": true, + "success_%": 100.0, + "run_time": "7.535 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "14.961 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "9.584 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "1.246 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": 
"agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. 
", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "generated" + } +} \ No newline at end of file -- cgit v1.2.3 From 147425837c54ce67453a9812acb5f8e615958489 Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Wed, 19 Jul 2023 20:40:50 +0000 Subject: Auto-GPT-20230719204050 --- reports/Auto-GPT/file6_07-19-20-40.json | 268 ++++++++++++++++++++++ 1 file changed, 268 insertions(+) create mode 100644 reports/Auto-GPT/file6_07-19-20-40.json diff --git a/reports/Auto-GPT/file6_07-19-20-40.json b/reports/Auto-GPT/file6_07-19-20-40.json new file mode 100644 index 000000000..715d2a276 --- /dev/null +++ b/reports/Auto-GPT/file6_07-19-20-40.json @@ -0,0 +1,268 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-19-20:40", + "metrics": { + "run_time": "23.24 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "22.992 seconds" + }, + "reached_cutoff": false + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice.
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestThreeSum::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0], venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0], venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0], venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0], venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "autogpt/workspace/auto_gpt_workspace" + } +} \ No newline at end of file -- cgit v1.2.3 From ef684baa441472cb4bca99d16319cac17ba9b8e2 Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Wed, 19 Jul 2023 20:41:24 +0000 Subject: beebot-20230719204124 --- reports/beebot/file1_07-19-20-40.json | 268 ++++++++++++++++++++++ 1 file changed, 268 insertions(+) create mode 100644 reports/beebot/file1_07-19-20-40.json diff --git a/reports/beebot/file1_07-19-20-40.json b/reports/beebot/file1_07-19-20-40.json new file mode 100644 index 000000000..1f728f85a --- /dev/null +++ b/reports/beebot/file1_07-19-20-40.json @@ -0,0 +1,268 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-19-20:41", + "metrics": { + "run_time": "46.24 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "46.006 seconds" + }, + "reached_cutoff": false + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target.
You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0] depends on challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.003 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "challenges/test_all.py::TestSearch::test_method[challenge_data0] depends on challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "challenges/test_all.py::TestReadFile::test_method[challenge_data0] depends on challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "challenges/test_all.py::TestThreeSum::test_method[challenge_data0] depends on challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0], challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on challenges/test_all.py::TestSearch::test_method[challenge_data0], challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on challenges/test_all.py::TestReadFile::test_method[challenge_data0], challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on challenges/test_all.py::TestReadFile::test_method[challenge_data0], challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. 
In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "workspace" + } +} \ No newline at end of file -- cgit v1.2.3 From df5c66d8a7642fdc473d878fc4dcf11e353f31da Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Wed, 19 Jul 2023 14:53:42 -0700 Subject: Change beebot submodule (#170) --- .gitmodules | 2 +- agent/beebot | 2 +- reports/beebot/file1_07-19-20-40.json | 268 ---------------------------------- 3 files changed, 2 insertions(+), 270 deletions(-) delete mode 100644 reports/beebot/file1_07-19-20-40.json diff --git a/.gitmodules b/.gitmodules index 9fefe0c06..389314b1d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -21,4 +21,4 @@ [submodule "agent/beebot"] path = agent/beebot url = https://github.com/merwanehamadi/beebot.git - branch = benchmark-integration + branch = master diff --git a/agent/beebot b/agent/beebot index b9686b12d..2e9291d93 160000 --- a/agent/beebot +++ b/agent/beebot @@ -1 +1 @@ -Subproject commit b9686b12d317b26095d706665f0a43244d7afb7c +Subproject commit 2e9291d93080890884bf63ae19d3533a960b69a3 diff --git a/reports/beebot/file1_07-19-20-40.json b/reports/beebot/file1_07-19-20-40.json deleted file mode 100644 index 1f728f85a..000000000 --- a/reports/beebot/file1_07-19-20-40.json +++ /dev/null @@ -1,268 +0,0 @@ -{ - "command": "agbenchmark start", - "completion_time": "2023-07-19-20:41", - "metrics": { - "run_time": "46.24 seconds", - "highest_difficulty": "No successful tests" - }, - "tests": { - "TestWriteFile": { - "data_path": "agbenchmark/challenges/interface/write_file", - "is_regression": false, - "task": "Print the the capital of America to a .txt file", - "answer": "Washington", - "description": "Tests the writing to file", - "metrics": { - "difficulty": "interface", - "success": false, - "fail_reason": "assert 1 in [0.0]", - "success_%": 0.0, - "run_time": "46.006 seconds" - }, - "reached_cutoff": false - }, - "TestBasicCodeGeneration": { - "data_path": "agbenchmark/challenges/code/d3_two_sum", - "is_regression": false, - "task": 
"Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", - "answer": "The two_sum function coded properly.", - "description": "Tests ability for the agent to create the two_sum function.", - "metrics": { - "difficulty": "advanced", - "success": false, - "fail_reason": "challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0] depends on challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", - "success_%": 0.0, - "run_time": "0.003 seconds" - }, - "reached_cutoff": false - }, - "TestSearch": { - "data_path": "agbenchmark/challenges/interface/search", - "is_regression": false, - "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", - "answer": "This is a Heading\nThis is a paragraph.", - "description": "Tests if an llm can search", - "metrics": { - "difficulty": "interface", - "success": false, - "fail_reason": "challenges/test_all.py::TestSearch::test_method[challenge_data0] depends on challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", - "success_%": 0.0, - "run_time": "0.001 seconds" - }, - "reached_cutoff": false - }, - "TestReadFile": { - "data_path": "agbenchmark/challenges/interface/read_file", - "is_regression": false, - "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", - "answer": "random string Hello World!", - "description": "This reads the file quickly", - "metrics": { - "difficulty": "interface", - "success": false, - "fail_reason": "challenges/test_all.py::TestReadFile::test_method[challenge_data0] depends on challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", - "success_%": 0.0, - "run_time": "0.001 seconds" - }, - "reached_cutoff": false - }, - "TestThreeSum": { - "data_path": "agbenchmark/challenges/code/d5_three_sum", - "is_regression": false, - "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", - "answer": "The three_sum function coded properly.", - "description": "Tests ability for the agent to create the three_sum function.", - "metrics": { - "difficulty": "advanced", - "success": false, - "fail_reason": "challenges/test_all.py::TestThreeSum::test_method[challenge_data0] depends on challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0], challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", - "success_%": 0.0, - "run_time": "0.002 seconds" - }, - "reached_cutoff": false - }, - "TestBasicRetrieval": { - "data_path": "agbenchmark/challenges/retrieval/r1_book_price", - "is_regression": false, - "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", - "answer": "\u00a325.89", - "description": "Specifies specific website to retrieve website from.", - "metrics": { - "difficulty": "basic", - "success": false, - "fail_reason": "challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on challenges/test_all.py::TestSearch::test_method[challenge_data0], challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", - "success_%": 0.0, - "run_time": "0.002 seconds" - }, - "reached_cutoff": false - }, - "TestDebugSimpleTypoWithGuidance": { - "data_path": "agbenchmark/challenges/code/d1_debug", - "is_regression": false, - "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", - "answer": "[0, 1] [2, 5] [0, 3]", - "description": "Tests ability for the agent to debug python code with a simple typo in it.", - "metrics": { - "difficulty": "basic", - "success": false, - "fail_reason": "challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on challenges/test_all.py::TestReadFile::test_method[challenge_data0], challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", - "success_%": 0.0, - "run_time": "0.002 seconds" - }, - "reached_cutoff": false - }, - "TestBasicMemory": { - "data_path": "agbenchmark/challenges/memory/m1_id", - "is_regression": false, - "task": "Follow the instructions in the instructions_1.txt file", - "answer": "2314", - "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", - "metrics": { - "difficulty": "basic", - "success": false, - "fail_reason": "challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on challenges/test_all.py::TestReadFile::test_method[challenge_data0], challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", - "success_%": 0.0, - "run_time": "0.002 seconds" - }, - "reached_cutoff": false - }, - "TestRetrieval2.0": { - "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", - "is_regression": false, - "task": "Write tesla's revenue in 2022 into a .txt file.", - "answer": "It was $81.462 billion in 2022.", - "description": "A no guardrails search for info", - "metrics": { - "difficulty": "novice", - "success": false, - "fail_reason": "challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", - "success_%": 0.0, - "run_time": "0.002 seconds" - }, - "reached_cutoff": false - }, - "TestDebugSimpleTypoWithoutGuidance": { - "data_path": "agbenchmark/challenges/code/d2_vague", - "is_regression": false, - "task": "Make test.py run without errors.", - "answer": "[0, 1] [2, 5] [0, 3]", - "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", - "metrics": { - "difficulty": "novice", - "success": false, - "fail_reason": "challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", - "success_%": 0.0, - "run_time": "0.002 seconds" - }, - "reached_cutoff": false - }, - "TestCreateSimpleWebServer": { - "data_path": "agbenchmark/challenges/code/d4_web_server", - "is_regression": false, - "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", - "answer": "GET localhost:8079/health responds with a 200 OK", - "description": "Tests ability for the agent to build a simple web server locally", - "metrics": { - "difficulty": "advanced", - "success": false, - "fail_reason": "challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", - "success_%": 0.0, - "run_time": "0.001 seconds" - }, - "reached_cutoff": false - }, - "TestRememberMultipleIds": { - "data_path": "agbenchmark/challenges/memory/m2_multiple", - "is_regression": false, - "task": "Follow the instructions in the instructions_1.txt file", - "answer": "3145\n3791\n9317\n9471", - "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", - "metrics": { - "difficulty": "novice", - "success": false, - "fail_reason": "challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", - "success_%": 0.0, - "run_time": "0.001 seconds" - }, - "reached_cutoff": false - }, - "TestRetrieval2.1": { - "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", - "is_regression": false, - "task": "Write Tesla's precise revenue in 2022 into a .txt file.", - "answer": "It was $81.462 billion in 2022.", - "description": "This one checks the accuracy of the information over r2", - "metrics": { - "difficulty": "novice", - "success": false, - "fail_reason": "challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", - "success_%": 0.0, - "run_time": "0.001 seconds" - }, - "reached_cutoff": false - }, - "TestRememberMultipleWithNoise": { - "data_path": "agbenchmark/challenges/memory/m3_noise", - "is_regression": false, - "task": "Follow the instructions in the instructions_1.txt file", - "answer": "3145\n3791\n9317\n9471", - "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", - "metrics": { - "difficulty": "intermediate", - "success": false, - "fail_reason": "challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", - "success_%": 0.0, - "run_time": "0.002 seconds" - }, - "reached_cutoff": false - }, - "TestRetrieval3": { - "data_path": "agbenchmark/challenges/retrieval/r3", - "is_regression": false, - "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", - "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", - "description": "Tests ability to retrieve information.", - "metrics": { - "difficulty": "intermediate", - "success": false, - "fail_reason": "challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", - "success_%": 0.0, - "run_time": "0.002 seconds" - }, - "reached_cutoff": false - }, - "TestRetrieval2.2": { - "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", - "is_regression": false, - "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", - "answer": "It was $81.462 billion in 2022. 
In millions the answer is 81,462.", - "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", - "metrics": { - "difficulty": "intermediate", - "success": false, - "fail_reason": "challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", - "success_%": 0.0, - "run_time": "0.002 seconds" - }, - "reached_cutoff": false - }, - "TestRememberMultiplePhrasesWithNoise": { - "data_path": "agbenchmark/challenges/memory/m4_phrases", - "is_regression": false, - "task": "Follow the instructions in the instructions_1.txt file", - "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", - "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", - "metrics": { - "difficulty": "advanced", - "success": false, - "fail_reason": "challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", - "success_%": 0.0, - "run_time": "0.001 seconds" - }, - "reached_cutoff": false - } - }, - "config": { - "workspace": "workspace" - } -} \ No newline at end of file -- cgit v1.2.3 From aec0e2fe7af62a0bac2921e108d2ec3ee2e8b8dd Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Wed, 19 Jul 2023 21:56:12 +0000 Subject: smol-developer-20230719215612 --- reports/smol-developer/file7_07-19-21-55.json | 266 ++++++++++++++++++++++++++ 1 file changed, 266 insertions(+) create mode 100644 reports/smol-developer/file7_07-19-21-55.json diff --git a/reports/smol-developer/file7_07-19-21-55.json b/reports/smol-developer/file7_07-19-21-55.json new file mode 100644 index 000000000..0ed5b94cd --- /dev/null +++ b/reports/smol-developer/file7_07-19-21-55.json @@ -0,0 +1,266 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-19-21:56", + "metrics": { + "run_time": "35.04 seconds", + "highest_difficulty": "advanced: 5" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "4.839 seconds" + }, + "reached_cutoff": false + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": true, + "success_%": 100.0, + "run_time": "7.157 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "15.171 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "6.181 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "1.503 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": 
"agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. 
", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "generated" + } +} \ No newline at end of file -- cgit v1.2.3 From d14ccd71cdc1eada0675766a8982e430b7caf9a7 Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Wed, 19 Jul 2023 21:59:39 +0000 Subject: Auto-GPT-20230719215939 --- reports/Auto-GPT/file7_07-19-21-56.json | 267 ++++++++++++++++++++++++++++++++ 1 file changed, 267 insertions(+) create mode 100644 reports/Auto-GPT/file7_07-19-21-56.json diff --git a/reports/Auto-GPT/file7_07-19-21-56.json b/reports/Auto-GPT/file7_07-19-21-56.json new file mode 100644 index 000000000..636cb642f --- /dev/null +++ b/reports/Auto-GPT/file7_07-19-21-56.json @@ -0,0 +1,267 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-19-21:59", + "metrics": { + "run_time": "169.14 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "49.739 seconds" + }, + "reached_cutoff": false + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "16.504 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "26.102 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "76.482 seconds" + }, + "reached_cutoff": true + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestThreeSum::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "autogpt/workspace/auto_gpt_workspace" + } +} \ No newline at end of file -- cgit v1.2.3 From 71ff6f1b8ce2a860f2b97f85726b69aac365170a Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Wed, 19 Jul 2023 22:02:29 +0000 Subject: gpt-engineer-20230719220229 --- reports/gpt-engineer/file6_07-19-21-55.json | 264 ++++++++++++++++++++++++++++ 1 file changed, 264 insertions(+) create mode 100644 reports/gpt-engineer/file6_07-19-21-55.json diff --git a/reports/gpt-engineer/file6_07-19-21-55.json b/reports/gpt-engineer/file6_07-19-21-55.json new file mode 100644 index 000000000..aa91aeaad --- /dev/null +++ b/reports/gpt-engineer/file6_07-19-21-55.json @@ -0,0 +1,264 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-19-22:02", + "metrics": { + "run_time": "403.03 seconds", + "highest_difficulty": "advanced: 5" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "64.853 seconds" + }, + "reached_cutoff": true + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. 
You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": true, + "success_%": 100.0, + "run_time": "70.097 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "74.87 seconds" + }, + "reached_cutoff": true + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: '/home/runner/work/Auto-GPT-Benchmarks/Auto-GPT-Benchmarks/agent/gpt-engineer/projects/my-new-project/workspace/file_to_check.txt'", + "success_%": 0.0, + "run_time": "65.049 seconds" + }, + "reached_cutoff": true + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": true, + "success_%": 100.0, + "run_time": "85.607 seconds" + }, + "reached_cutoff": true + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "assert 1 in []", + "success_%": 0.0, + "run_time": "42.365 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "projects/my-new-project/workspace" + } +} \ No newline at end of file -- cgit v1.2.3 From 204af6597db9eb47b3252b252470559853939cbe Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Wed, 19 Jul 2023 22:04:21 +0000 Subject: beebot-20230719220421 --- reports/beebot/file1_07-19-21-56.json | 263 ++++++++++++++++++++++++++++++++++ 1 file changed, 263 insertions(+) create mode 100644 reports/beebot/file1_07-19-21-56.json diff --git a/reports/beebot/file1_07-19-21-56.json b/reports/beebot/file1_07-19-21-56.json new file mode 100644 index 000000000..78ea838a8 --- /dev/null +++ b/reports/beebot/file1_07-19-21-56.json @@ -0,0 +1,263 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-19-22:04", + "metrics": { + "run_time": "494.94 seconds", + "highest_difficulty": "advanced: 5" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "15.637 seconds" + }, + "reached_cutoff": false + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": true, + "success_%": 100.0, + "run_time": "94.174 seconds" + }, + "reached_cutoff": true + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in []", + "success_%": 0.0, + "run_time": "20.195 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "25.044 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "64.425 seconds" + }, + "reached_cutoff": true + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "75.682 seconds" + }, + "reached_cutoff": true + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the 
instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": true, + "success_%": 100.0, + "run_time": "60.342 seconds" + }, + "reached_cutoff": true + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.007 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": true, + "success_%": 100.0, + "run_time": "61.089 seconds" + }, + "reached_cutoff": true + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: '/home/runner/work/Auto-GPT-Benchmarks/Auto-GPT-Benchmarks/agent/beebot/workspace/result.txt'", + "success_%": 0.0, + "run_time": "78.158 seconds" + }, + "reached_cutoff": true + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. 
In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "workspace" + } +} \ No newline at end of file -- cgit v1.2.3 From 622e0a2d62e0cdee24bad71e844f3891cde60331 Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Thu, 20 Jul 2023 08:19:09 +0000 Subject: smol-developer-20230720081909 --- reports/smol-developer/file8_07-20-08-18.json | 266 ++++++++++++++++++++++++++ 1 file changed, 266 insertions(+) create mode 100644 reports/smol-developer/file8_07-20-08-18.json diff --git a/reports/smol-developer/file8_07-20-08-18.json b/reports/smol-developer/file8_07-20-08-18.json new file mode 100644 index 000000000..01e7b79a1 --- /dev/null +++ b/reports/smol-developer/file8_07-20-08-18.json @@ -0,0 +1,266 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-20-08:19", + "metrics": { + "run_time": "48.44 seconds", + "highest_difficulty": "advanced: 5" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "8.826 seconds" + }, + "reached_cutoff": false + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": true, + "success_%": 100.0, + "run_time": "9.169 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "18.189 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "10.634 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "1.403 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": 
"agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. 
", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "generated" + } +} \ No newline at end of file -- cgit v1.2.3 From 5f795e203a893d0707812a6ec04dacd314a706eb Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Thu, 20 Jul 2023 08:20:13 +0000 Subject: gpt-engineer-20230720082013 --- reports/gpt-engineer/file7_07-20-08-18.json | 267 ++++++++++++++++++++++++++++ 1 file changed, 267 insertions(+) create mode 100644 reports/gpt-engineer/file7_07-20-08-18.json diff --git a/reports/gpt-engineer/file7_07-20-08-18.json b/reports/gpt-engineer/file7_07-20-08-18.json new file mode 100644 index 000000000..ae1bcaf81 --- /dev/null +++ b/reports/gpt-engineer/file7_07-20-08-18.json @@ -0,0 +1,267 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-20-08:20", + "metrics": { + "run_time": "123.99 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "65.136 seconds" + }, + "reached_cutoff": true + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "[Errno 20] Not a directory: '/home/runner/work/Auto-GPT-Benchmarks/Auto-GPT-Benchmarks/agent/gpt-engineer/projects/my-new-project/workspace'", + "success_%": 0.0, + "run_time": "57.021 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in []", + "success_%": 0.0, + "run_time": "0.756 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "[Errno 20] Not a directory: '/home/runner/work/Auto-GPT-Benchmarks/Auto-GPT-Benchmarks/agent/gpt-engineer/projects/my-new-project/workspace/file_to_check.txt'", + "success_%": 0.0, + "run_time": "0.774 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestThreeSum::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "projects/my-new-project/workspace" + } +} \ No newline at end of file -- cgit v1.2.3 From ad05c9886d527805b61f58e61333128406915122 Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Thu, 20 Jul 2023 08:26:52 +0000 Subject: beebot-20230720082652 --- reports/beebot/file2_07-20-08-18.json | 264 ++++++++++++++++++++++++++++++++++ 1 file changed, 264 insertions(+) create mode 100644 reports/beebot/file2_07-20-08-18.json diff --git a/reports/beebot/file2_07-20-08-18.json b/reports/beebot/file2_07-20-08-18.json new file mode 100644 index 000000000..4d423a445 --- /dev/null +++ b/reports/beebot/file2_07-20-08-18.json @@ -0,0 +1,264 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-20-08:26", + "metrics": { + "run_time": "480.5 seconds", + "highest_difficulty": "advanced: 5" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "22.969 seconds" + }, + "reached_cutoff": false + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
From ad05c9886d527805b61f58e61333128406915122 Mon Sep 17 00:00:00 2001
From: Auto-GPT-Bot
Date: Thu, 20 Jul 2023 08:26:52 +0000
Subject: beebot-20230720082652

---
 reports/beebot/file2_07-20-08-18.json | 264 ++++++++++++++++++++++++++++++++++
 1 file changed, 264 insertions(+)
 create mode 100644 reports/beebot/file2_07-20-08-18.json

diff --git a/reports/beebot/file2_07-20-08-18.json b/reports/beebot/file2_07-20-08-18.json
new file mode 100644
index 000000000..4d423a445
--- /dev/null
+++ b/reports/beebot/file2_07-20-08-18.json
@@ -0,0 +1,264 @@
+{
+  "command": "agbenchmark start",
+  "completion_time": "2023-07-20-08:26",
+  "metrics": {
+    "run_time": "480.5 seconds",
+    "highest_difficulty": "advanced: 5"
+  },
+  "tests": {
+    "TestWriteFile": {
+      "data_path": "agbenchmark/challenges/interface/write_file",
+      "is_regression": false,
+      "task": "Print the capital of America to a .txt file",
+      "answer": "Washington",
+      "description": "Tests the writing to file",
+      "metrics": {
+        "difficulty": "interface",
+        "success": true,
+        "success_%": 100.0,
+        "run_time": "22.969 seconds"
+      },
+      "reached_cutoff": false
+    },
+    "TestBasicCodeGeneration": {
+      "data_path": "agbenchmark/challenges/code/d3_two_sum",
+      "is_regression": false,
+      "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].",
+      "answer": "The two_sum function coded properly.",
+      "description": "Tests ability for the agent to create the two_sum function.",
+      "metrics": {
+        "difficulty": "advanced",
+        "success": true,
+        "success_%": 100.0,
+        "run_time": "90.6 seconds"
+      },
+      "reached_cutoff": true
+    },
+    "TestSearch": {
+      "data_path": "agbenchmark/challenges/interface/search",
+      "is_regression": false,
+      "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file",
+      "answer": "This is a Heading\nThis is a paragraph.",
+      "description": "Tests if an llm can search",
+      "metrics": {
+        "difficulty": "interface",
+        "success": false,
+        "fail_reason": "assert 1 in []",
+        "success_%": 0.0,
+        "run_time": "62.713 seconds"
+      },
+      "reached_cutoff": true
+    },
+    "TestReadFile": {
+      "data_path": "agbenchmark/challenges/interface/read_file",
+      "is_regression": false,
+      "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
+      "answer": "random string Hello World!",
+      "description": "This reads the file quickly",
+      "metrics": {
+        "difficulty": "interface",
+        "success": true,
+        "success_%": 100.0,
+        "run_time": "60.053 seconds"
+      },
+      "reached_cutoff": true
+    },
+    "TestThreeSum": {
+      "data_path": "agbenchmark/challenges/code/d5_three_sum",
+      "is_regression": false,
+      "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].",
+      "answer": "The three_sum function coded properly.",
+      "description": "Tests ability for the agent to create the three_sum function.",
+      "metrics": {
+        "difficulty": "advanced",
+        "success": false,
+        "fail_reason": "assert 1 in [0.0]",
+        "success_%": 0.0,
+        "run_time": "71.451 seconds"
+      },
+      "reached_cutoff": true
+    },
+    "TestBasicRetrieval": {
+      "data_path": "agbenchmark/challenges/retrieval/r1_book_price",
+      "is_regression": false,
+      "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.",
+      "answer": "\u00a325.89",
+      "description": "Specifies specific website to retrieve website from.",
+      "metrics": {
+        "difficulty": "basic",
+        "success": false,
+        "fail_reason": "challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on challenges/test_all.py::TestSearch::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.001 seconds"
+      },
+      "reached_cutoff": false
+    },
+    "TestDebugSimpleTypoWithGuidance": {
+      "data_path": "agbenchmark/challenges/code/d1_debug",
+      "is_regression": false,
+      "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
+      "answer": "[0, 1] [2, 5] [0, 3]",
+      "description": "Tests ability for the agent to debug python code with a simple typo in it.",
+      "metrics": {
+        "difficulty": "basic",
+        "success": false,
+        "fail_reason": "assert 1 in [0.0]",
+        "success_%": 0.0,
+        "run_time": "79.582 seconds"
+      },
+      "reached_cutoff": true
+    },
+    "TestBasicMemory": {
+      "data_path": "agbenchmark/challenges/memory/m1_id",
+      "is_regression": false,
+      "task": "Follow the instructions in the instructions_1.txt file",
+      "answer": "2314",
+      "description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files",
+      "metrics": {
+        "difficulty": "basic",
+        "success": true,
+        "success_%": 100.0,
+        "run_time": "60.606 seconds"
+      },
+      "reached_cutoff": true
+    },
+    "TestRetrieval2.0": {
+      "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue",
+      "is_regression": false,
+      "task": "Write tesla's revenue in 2022 into a .txt file.",
+      "answer": "It was $81.462 billion in 2022.",
+      "description": "A no guardrails search for info",
+      "metrics": {
+        "difficulty": "novice",
+        "success": false,
+        "fail_reason": "challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.001 seconds"
+      },
+      "reached_cutoff": false
+    },
+    "TestDebugSimpleTypoWithoutGuidance": {
+      "data_path": "agbenchmark/challenges/code/d2_vague",
+      "is_regression": false,
+      "task": "Make test.py run without errors.",
+      "answer": "[0, 1] [2, 5] [0, 3]",
+      "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance",
+      "metrics": {
+        "difficulty": "novice",
+        "success": false,
+        "fail_reason": "challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.007 seconds"
+      },
+      "reached_cutoff": false
+    },
+    "TestCreateSimpleWebServer": {
+      "data_path": "agbenchmark/challenges/code/d4_web_server",
+      "is_regression": false,
+      "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ",
+      "answer": "GET localhost:8079/health responds with a 200 OK",
+      "description": "Tests ability for the agent to build a simple web server locally",
+      "metrics": {
+        "difficulty": "advanced",
+        "success": false,
+        "fail_reason": "challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.001 seconds"
+      },
+      "reached_cutoff": false
+    },
+    "TestRememberMultipleIds": {
+      "data_path": "agbenchmark/challenges/memory/m2_multiple",
+      "is_regression": false,
+      "task": "Follow the instructions in the instructions_1.txt file",
+      "answer": "3145\n3791\n9317\n9471",
+      "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.",
+      "metrics": {
+        "difficulty": "novice",
+        "success": false,
+        "fail_reason": "[Errno 2] No such file or directory: '/home/runner/work/Auto-GPT-Benchmarks/Auto-GPT-Benchmarks/agent/beebot/workspace/result.txt'",
+        "success_%": 0.0,
+        "run_time": "32.306 seconds"
+      },
+      "reached_cutoff": false
+    },
+    "TestRetrieval2.1": {
+      "data_path": "agbenchmark/challenges/retrieval/r2.1_specific",
+      "is_regression": false,
+      "task": "Write Tesla's precise revenue in 2022 into a .txt file.",
+      "answer": "It was $81.462 billion in 2022.",
+      "description": "This one checks the accuracy of the information over r2",
+      "metrics": {
+        "difficulty": "novice",
+        "success": false,
+        "fail_reason": "challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.001 seconds"
+      },
+      "reached_cutoff": false
+    },
+    "TestRememberMultipleWithNoise": {
+      "data_path": "agbenchmark/challenges/memory/m3_noise",
+      "is_regression": false,
+      "task": "Follow the instructions in the instructions_1.txt file",
+      "answer": "3145\n3791\n9317\n9471",
+      "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
+      "metrics": {
+        "difficulty": "intermediate",
+        "success": false,
+        "fail_reason": "challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.001 seconds"
+      },
+      "reached_cutoff": false
+    },
+    "TestRetrieval3": {
+      "data_path": "agbenchmark/challenges/retrieval/r3",
+      "is_regression": false,
+      "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 million).",
+      "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions",
+      "description": "Tests ability to retrieve information.",
+      "metrics": {
+        "difficulty": "intermediate",
+        "success": false,
+        "fail_reason": "challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.001 seconds"
+      },
+      "reached_cutoff": false
+    },
+    "TestRetrieval2.2": {
+      "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting",
+      "is_regression": false,
+      "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 million).",
+      "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.",
+      "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.",
+      "metrics": {
+        "difficulty": "intermediate",
+        "success": false,
+        "fail_reason": "challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.001 seconds"
+      },
+      "reached_cutoff": false
+    },
+    "TestRememberMultiplePhrasesWithNoise": {
+      "data_path": "agbenchmark/challenges/memory/m4_phrases",
+      "is_regression": false,
+      "task": "Follow the instructions in the instructions_1.txt file",
+      "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
+      "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
+      "metrics": {
+        "difficulty": "advanced",
+        "success": false,
+        "fail_reason": "challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.001 seconds"
+      },
+      "reached_cutoff": false
+    }
+  },
+  "config": {
+    "workspace": "workspace"
+  }
+}
\ No newline at end of file
-- cgit v1.2.3
{task['objective']}\n" - parsed_response += f"Task type: {task['type']} " - parsed_response += f"Priority: {task['priority']}\n" - parsed_response += f"Ready Criteria:\n" - for j, criteria in enumerate(task["ready_criteria"]): - parsed_response += f" {j+1}. {criteria}\n" - parsed_response += f"Acceptance Criteria:\n" - for j, criteria in enumerate(task["acceptance_criteria"]): - parsed_response += f" {j+1}. {criteria}\n" - parsed_response += "\n" - - return parsed_response - - -def parse_next_ability(current_task, next_ability: dict) -> str: - parsed_response = f"Current Task: {current_task.objective}\n" - ability_args = ", ".join( - f"{k}={v}" for k, v in next_ability["ability_arguments"].items() - ) - parsed_response += f"Next Ability: {next_ability['next_ability']}({ability_args})\n" - parsed_response += f"Motivation: {next_ability['motivation']}\n" - parsed_response += f"Self-criticism: {next_ability['self_criticism']}\n" - parsed_response += f"Reasoning: {next_ability['reasoning']}\n" - return parsed_response - - -def parse_ability_result(ability_result) -> str: - parsed_response = f"Ability: {ability_result['ability_name']}\n" - parsed_response += f"Ability Arguments: {ability_result['ability_args']}\n" - parsed_response = f"Ability Result: {ability_result['success']}\n" - parsed_response += f"Message: {ability_result['message']}\n" - parsed_response += f"Data: {ability_result['new_knowledge']}\n" - return parsed_response diff --git a/autogpt/core/runner/client_lib/parser.py b/autogpt/core/runner/client_lib/parser.py new file mode 100755 index 000000000..9246ea82d --- /dev/null +++ b/autogpt/core/runner/client_lib/parser.py @@ -0,0 +1,45 @@ +def parse_agent_name_and_goals(name_and_goals: dict) -> str: + parsed_response = f"Agent Name: {name_and_goals['agent_name']}\n" + parsed_response += f"Agent Role: {name_and_goals['agent_role']}\n" + parsed_response += "Agent Goals:\n" + for i, goal in enumerate(name_and_goals["agent_goals"]): + parsed_response += f"{i+1}. {goal}\n" + return parsed_response + + +def parse_agent_plan(plan: dict) -> str: + parsed_response = f"Agent Plan:\n" + for i, task in enumerate(plan["task_list"]): + parsed_response += f"{i+1}. {task['objective']}\n" + parsed_response += f"Task type: {task['type']} " + parsed_response += f"Priority: {task['priority']}\n" + parsed_response += f"Ready Criteria:\n" + for j, criteria in enumerate(task["ready_criteria"]): + parsed_response += f" {j+1}. {criteria}\n" + parsed_response += f"Acceptance Criteria:\n" + for j, criteria in enumerate(task["acceptance_criteria"]): + parsed_response += f" {j+1}. 
{criteria}\n" + parsed_response += "\n" + + return parsed_response + + +def parse_next_ability(current_task, next_ability: dict) -> str: + parsed_response = f"Current Task: {current_task.objective}\n" + ability_args = ", ".join( + f"{k}={v}" for k, v in next_ability["ability_arguments"].items() + ) + parsed_response += f"Next Ability: {next_ability['next_ability']}({ability_args})\n" + parsed_response += f"Motivation: {next_ability['motivation']}\n" + parsed_response += f"Self-criticism: {next_ability['self_criticism']}\n" + parsed_response += f"Reasoning: {next_ability['reasoning']}\n" + return parsed_response + + +def parse_ability_result(ability_result) -> str: + parsed_response = f"Ability: {ability_result['ability_name']}\n" + parsed_response += f"Ability Arguments: {ability_result['ability_args']}\n" + parsed_response = f"Ability Result: {ability_result['success']}\n" + parsed_response += f"Message: {ability_result['message']}\n" + parsed_response += f"Data: {ability_result['new_knowledge']}\n" + return parsed_response -- cgit v1.2.3 From db95d4cb842ea1c7e7eea5d93e525c5b25127a5c Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Thu, 20 Jul 2023 17:34:49 +0200 Subject: Agent loop v2: Planning & Task Management (part 1: refactoring) (#4799) * Move rename module `agent` -> `agents` * WIP: abstract agent structure into base class and port Agent * Move command arg path sanitization to decorator * Add fallback token limit in llm.utils.create_chat_completion * Rebase `MessageHistory` class on `ChatSequence` class * Fix linting * Consolidate logging modules * Wham Bam Boom * Fix tests & linting complaints * Update Agent class docstring * Fix Agent import in autogpt.llm.providers.openai * Fix agent kwarg in test_execute_code.py * Fix benchmarks.py * Clean up lingering Agent(ai_name=...) initializations * Fix agent kwarg * Make sanitize_path_arg decorator more robust * Fix linting * Fix command enabling lambda's * Use relative paths in file ops logger * Fix test_execute_python_file_not_found * Fix Config model validation breaking on .plugins * Define validator for Config.plugins * Fix Config model issues * Fix agent iteration budget in testing * Fix declaration of context_while_think * Fix Agent.parse_and_process_response signature * Fix Agent cycle_budget usages * Fix budget checking in BaseAgent.__next__ * Fix cycle budget initialization * Fix function calling in BaseAgent.think() * Include functions in token length calculation * Fix Config errors * Add debug thing to patched_api_requestor to investigate HTTP 400 errors * If this works I'm gonna be sad * Fix BaseAgent cycle budget logic and document attributes * Document attributes on `Agent` * Fix import issues between Agent and MessageHistory * Improve typing * Extract application code from the agent (#4982) * Extract application code from the agent * Wrap interaction loop in a function and call in benchmarks * Forgot the important function call * Add docstrings and inline comments to run loop * Update typing and docstrings in agent * Docstring formatting * Separate prompt construction from on_before_think * Use `self.default_cycle_instruction` in `Agent.think()` * Fix formatting * hot fix the SIGINT handler (#4997) The signal handler in the autogpt/main.py doesn't work properly because of the clean_input(...) func. This commit remedies this issue. 
From db95d4cb842ea1c7e7eea5d93e525c5b25127a5c Mon Sep 17 00:00:00 2001
From: Reinier van der Leer
Date: Thu, 20 Jul 2023 17:34:49 +0200
Subject: Agent loop v2: Planning & Task Management (part 1: refactoring)
 (#4799)

* Move rename module `agent` -> `agents`
* WIP: abstract agent structure into base class and port Agent
* Move command arg path sanitization to decorator
* Add fallback token limit in llm.utils.create_chat_completion
* Rebase `MessageHistory` class on `ChatSequence` class
* Fix linting
* Consolidate logging modules
* Wham Bam Boom
* Fix tests & linting complaints
* Update Agent class docstring
* Fix Agent import in autogpt.llm.providers.openai
* Fix agent kwarg in test_execute_code.py
* Fix benchmarks.py
* Clean up lingering Agent(ai_name=...) initializations
* Fix agent kwarg
* Make sanitize_path_arg decorator more robust
* Fix linting
* Fix command enabling lambda's
* Use relative paths in file ops logger
* Fix test_execute_python_file_not_found
* Fix Config model validation breaking on .plugins
* Define validator for Config.plugins
* Fix Config model issues
* Fix agent iteration budget in testing
* Fix declaration of context_while_think
* Fix Agent.parse_and_process_response signature
* Fix Agent cycle_budget usages
* Fix budget checking in BaseAgent.__next__
* Fix cycle budget initialization
* Fix function calling in BaseAgent.think()
* Include functions in token length calculation
* Fix Config errors
* Add debug thing to patched_api_requestor to investigate HTTP 400 errors
* If this works I'm gonna be sad
* Fix BaseAgent cycle budget logic and document attributes
* Document attributes on `Agent`
* Fix import issues between Agent and MessageHistory
* Improve typing

* Extract application code from the agent (#4982)

* Extract application code from the agent

* Wrap interaction loop in a function and call in benchmarks

* Forgot the important function call

* Add docstrings and inline comments to run loop

* Update typing and docstrings in agent

* Docstring formatting

* Separate prompt construction from on_before_think

* Use `self.default_cycle_instruction` in `Agent.think()`

* Fix formatting

* hot fix the SIGINT handler (#4997)

The signal handler in the autogpt/main.py doesn't work properly because of
the clean_input(...) func. This commit remedies this issue.

The issue is mentioned in https://github.com/Significant-Gravitas/Auto-GPT/pull/4799/files/3966cdfd694c2a80c0333823c3bc3da090f85ed3#r1264278776

* Update the sigint handler to be smart enough to actually work (#4999)

* Update the sigint handler to be smart enough to actually work

* Update autogpt/main.py

Co-authored-by: Reinier van der Leer

* Can still use context manager

* Merge in upstream

---------

Co-authored-by: Reinier van der Leer

* Fix CI

* Fix initial prompt construction

* off by one error

* allow exit/EXIT to shut down app

* Remove dead code

---------

Co-authored-by: collijk
Co-authored-by: Cyrus <39694513+cyrus-hawk@users.noreply.github.com>
---
 autogpt/agents/__init__.py                        |   3 +-
 autogpt/agents/agent.py                           | 448 ++++++++-------
 autogpt/agents/base.py                            | 318 +++++++++++++++
 autogpt/json_utils/utilities.py                   |  26 +-
 autogpt/llm/__init__.py                           |   2 +
 autogpt/llm/chat.py                               | 203 ----------
 autogpt/llm/providers/openai.py                   |   2 +-
 autogpt/main.py                                   | 280 ++++++++++++-
 autogpt/memory/message_history.py                 |  51 ++-
 autogpt/setup.py                                  |   2 +-
 autogpt/spinner.py                                |  30 +-
 autogpt/utils.py                                  |   6 +-
 benchmarks.py                                     |   8 +-
 docs/challenges/building_challenges.md            |   4 -
 .../debug_code/test_debug_code_challenge_a.py     |   2 +-
 tests/challenges/utils.py                         |   2 +-
 tests/conftest.py                                 |   8 +-
 tests/integration/agent_factory.py                |   3 -
 tests/integration/test_execute_code.py            |   4 +-
 tests/unit/test_agent.py                          |   5 +-
 tests/unit/test_message_history.py                |  10 +-
 tests/unit/test_spinner.py                        |  19 +-
 tests/unit/test_utils.py                          |  14 +-
 tests/vcr/__init__.py                             |   4 +
 24 files changed, 860 insertions(+), 594 deletions(-)
 create mode 100644 autogpt/agents/base.py
 delete mode 100644 autogpt/llm/chat.py

diff --git a/autogpt/agents/__init__.py b/autogpt/agents/__init__.py
index a6df24ad7..94a5f42a5 100644
--- a/autogpt/agents/__init__.py
+++ b/autogpt/agents/__init__.py
@@ -1,3 +1,4 @@
 from .agent import Agent
+from .base import AgentThoughts, BaseAgent, CommandArgs, CommandName
 
-__all__ = ["Agent"]
+__all__ = ["BaseAgent", "Agent", "CommandName", "CommandArgs", "AgentThoughts"]
diff --git a/autogpt/agents/agent.py b/autogpt/agents/agent.py
index 316cc4d44..f3fee609c 100644
--- a/autogpt/agents/agent.py
+++ b/autogpt/agents/agent.py
@@ -1,315 +1,215 @@
+from __future__ import annotations
+
 import json
-import signal
-import sys
+import time
 from datetime import datetime
 from pathlib import Path
+from typing import TYPE_CHECKING, Any, Optional
 
-from colorama import Fore, Style
+if TYPE_CHECKING:
+    from autogpt.config import AIConfig, Config
+    from autogpt.llm.base import ChatModelResponse, ChatSequence
+    from autogpt.memory.vector import VectorMemory
+    from autogpt.models.command_registry import CommandRegistry
 
-from autogpt.config import Config
-from autogpt.config.ai_config import AIConfig
-from autogpt.json_utils.utilities import extract_json_from_response, validate_json
-from autogpt.llm import ChatModelResponse
-from autogpt.llm.chat import chat_with_ai
-from autogpt.llm.providers.openai import OPEN_AI_CHAT_MODELS
+from autogpt.json_utils.utilities import extract_dict_from_response, validate_dict
+from autogpt.llm.api_manager import ApiManager
+from autogpt.llm.base import Message
 from autogpt.llm.utils import count_string_tokens
-from autogpt.logs import (
+from autogpt.logs import logger
+from autogpt.logs.log_cycle import (
     FULL_MESSAGE_HISTORY_FILE_NAME,
     NEXT_ACTION_FILE_NAME,
     USER_INPUT_FILE_NAME,
     LogCycleHandler,
-    logger,
-    print_assistant_thoughts,
-    remove_ansi_escape,
 )
-from autogpt.memory.message_history import MessageHistory
-from autogpt.memory.vector import VectorMemory
-from autogpt.models.command_registry import CommandRegistry
-from autogpt.speech import say_text
-from autogpt.spinner import Spinner
-from autogpt.utils import clean_input
 from autogpt.workspace import Workspace
 
+from .base import AgentThoughts, BaseAgent, CommandArgs, CommandName
 
-class Agent:
-    """Agent class for interacting with Auto-GPT.
-
-    Attributes:
-        ai_name: The name of the agent.
-        memory: The memory object to use.
-        next_action_count: The number of actions to execute.
-        system_prompt: The system prompt is the initial prompt that defines everything
-          the AI needs to know to achieve its task successfully.
-          Currently, the dynamic and customizable information in the system prompt are
-          ai_name, description and goals.
-
-        triggering_prompt: The last sentence the AI will see before answering.
-            For Auto-GPT, this prompt is:
-            Determine exactly one command to use, and respond using the format specified
-            above:
-            The triggering prompt is not part of the system prompt because between the
-            system prompt and the triggering
-            prompt we have contextual information that can distract the AI and make it
-            forget that its goal is to find the next task to achieve.
-            SYSTEM PROMPT
-            CONTEXTUAL INFORMATION (memory, previous conversations, anything relevant)
-            TRIGGERING PROMPT
-
-        The triggering prompt reminds the AI about its short term meta task
-        (defining the next task)
-    """
 
+class Agent(BaseAgent):
+    """Agent class for interacting with Auto-GPT."""
 
     def __init__(
         self,
-        ai_name: str,
-        memory: VectorMemory,
-        next_action_count: int,
-        command_registry: CommandRegistry,
         ai_config: AIConfig,
-        system_prompt: str,
+        command_registry: CommandRegistry,
+        memory: VectorMemory,
         triggering_prompt: str,
         workspace_directory: str | Path,
         config: Config,
+        cycle_budget: Optional[int] = None,
    ):
-        self.ai_name = ai_name
+        super().__init__(
+            ai_config=ai_config,
+            command_registry=command_registry,
+            config=config,
+            default_cycle_instruction=triggering_prompt,
+            cycle_budget=cycle_budget,
+        )
+
         self.memory = memory
-        self.history = MessageHistory.for_model(config.smart_llm, agent=self)
-        self.next_action_count = next_action_count
-        self.command_registry = command_registry
-        self.config = config
-        self.ai_config = ai_config
-        self.system_prompt = system_prompt
-        self.triggering_prompt = triggering_prompt
+        """VectorMemoryProvider used to manage the agent's context (TODO)"""
+
         self.workspace = Workspace(workspace_directory, config.restrict_to_workspace)
+        """Workspace that the agent has access to, e.g. for reading/writing files."""
+
         self.created_at = datetime.now().strftime("%Y%m%d_%H%M%S")
-        self.cycle_count = 0
+        """Timestamp the agent was created; only used for structured debug logging."""
+
         self.log_cycle_handler = LogCycleHandler()
-        self.smart_token_limit = OPEN_AI_CHAT_MODELS.get(config.smart_llm).max_tokens
-
-    def start_interaction_loop(self):
-        # Interaction Loop
-        self.cycle_count = 0
-        command_name = None
-        arguments = None
-        user_input = ""
-
-        # Signal handler for interrupting y -N
-        def signal_handler(signum, frame):
-            if self.next_action_count == 0:
-                sys.exit()
-            else:
-                print(
-                    Fore.RED
-                    + "Interrupt signal received. Stopping continuous command execution."
-                    + Style.RESET_ALL
-                )
-                self.next_action_count = 0
+        """LogCycleHandler for structured debug logging."""
+
+    def construct_base_prompt(self, *args, **kwargs) -> ChatSequence:
+        if kwargs.get("prepend_messages") is None:
+            kwargs["prepend_messages"] = []
+
+        # Clock
+        kwargs["prepend_messages"].append(
+            Message("system", f"The current time and date is {time.strftime('%c')}"),
+        )
 
-        signal.signal(signal.SIGINT, signal_handler)
+        # Add budget information (if any) to prompt
+        api_manager = ApiManager()
+        if api_manager.get_total_budget() > 0.0:
+            remaining_budget = (
+                api_manager.get_total_budget() - api_manager.get_total_cost()
+            )
+            if remaining_budget < 0:
+                remaining_budget = 0
+
+            budget_msg = Message(
+                "system",
+                f"Your remaining API budget is ${remaining_budget:.3f}"
+                + (
+                    " BUDGET EXCEEDED! SHUT DOWN!\n\n"
+                    if remaining_budget == 0
+                    else " Budget very nearly exceeded! Shut down gracefully!\n\n"
+                    if remaining_budget < 0.005
+                    else " Budget nearly exceeded. Finish up.\n\n"
+                    if remaining_budget < 0.01
+                    else ""
+                ),
+            )
+            logger.debug(budget_msg)
+
+            if kwargs.get("append_messages") is None:
+                kwargs["append_messages"] = []
+            kwargs["append_messages"].append(budget_msg)
+
+        return super().construct_base_prompt(*args, **kwargs)
+
+    def on_before_think(self, *args, **kwargs) -> ChatSequence:
+        prompt = super().on_before_think(*args, **kwargs)
+
+        self.log_cycle_handler.log_count_within_cycle = 0
+        self.log_cycle_handler.log_cycle(
+            self.ai_config.ai_name,
+            self.created_at,
+            self.cycle_count,
+            self.history.raw(),
+            FULL_MESSAGE_HISTORY_FILE_NAME,
+        )
+        return prompt
 
-        while True:
-            # Discontinue if continuous limit is reached
-            self.cycle_count += 1
-            self.log_cycle_handler.log_count_within_cycle = 0
+    def execute(
+        self,
+        command_name: str | None,
+        command_args: dict[str, str] | None,
+        user_input: str | None,
+    ) -> str:
+        # Execute command
+        if command_name is not None and command_name.lower().startswith("error"):
+            result = f"Could not execute command: {command_name}{command_args}"
+        elif command_name == "human_feedback":
+            result = f"Human feedback: {user_input}"
             self.log_cycle_handler.log_cycle(
                 self.ai_config.ai_name,
                 self.created_at,
                 self.cycle_count,
-                [m.raw() for m in self.history],
-                FULL_MESSAGE_HISTORY_FILE_NAME,
+                user_input,
+                USER_INPUT_FILE_NAME,
             )
-            if (
-                self.config.continuous_mode
-                and self.config.continuous_limit > 0
-                and self.cycle_count > self.config.continuous_limit
-            ):
-                logger.typewriter_log(
-                    "Continuous Limit Reached: ",
-                    Fore.YELLOW,
-                    f"{self.config.continuous_limit}",
-                )
-                break
-            # Send message to AI, get response
-            with Spinner("Thinking... ", plain_output=self.config.plain_output):
-                assistant_reply = chat_with_ai(
-                    self.config,
-                    self,
-                    self.system_prompt,
-                    self.triggering_prompt,
-                    self.smart_token_limit,
-                    self.config.smart_llm,
-                )
-
-            try:
-                assistant_reply_json = extract_json_from_response(
-                    assistant_reply.content
-                )
-                validate_json(assistant_reply_json, self.config)
-            except json.JSONDecodeError as e:
-                logger.error(f"Exception while validating assistant reply JSON: {e}")
-                assistant_reply_json = {}
+        else:
             for plugin in self.config.plugins:
-                if not plugin.can_handle_post_planning():
+                if not plugin.can_handle_pre_command():
                     continue
-                assistant_reply_json = plugin.post_planning(assistant_reply_json)
-
-            # Print Assistant thoughts
-            if assistant_reply_json != {}:
-                # Get command name and arguments
-                try:
-                    print_assistant_thoughts(
-                        self.ai_name, assistant_reply_json, self.config
-                    )
-                    command_name, arguments = extract_command(
-                        assistant_reply_json, assistant_reply, self.config
-                    )
-                    if self.config.speak_mode:
-                        say_text(f"I want to execute {command_name}", self.config)
-
-                except Exception as e:
-                    logger.error("Error: \n", str(e))
-            self.log_cycle_handler.log_cycle(
-                self.ai_config.ai_name,
-                self.created_at,
-                self.cycle_count,
-                assistant_reply_json,
-                NEXT_ACTION_FILE_NAME,
+                command_name, arguments = plugin.pre_command(command_name, command_args)
+            command_result = execute_command(
+                command_name=command_name,
+                arguments=command_args,
+                agent=self,
             )
+            result = f"Command {command_name} returned: " f"{command_result}"
 
-            # First log new-line so user can differentiate sections better in console
-            logger.typewriter_log("\n")
-            logger.typewriter_log(
-                "NEXT ACTION: ",
-                Fore.CYAN,
-                f"COMMAND = {Fore.CYAN}{remove_ansi_escape(command_name)}{Style.RESET_ALL} "
-                f"ARGUMENTS = {Fore.CYAN}{arguments}{Style.RESET_ALL}",
+        result_tlength = count_string_tokens(str(command_result), self.llm.name)
+        memory_tlength = count_string_tokens(
+            str(self.history.summary_message()), self.llm.name
             )
+        if result_tlength + memory_tlength > self.send_token_limit:
+            result = f"Failure: command {command_name} returned too much output. \
+                Do not execute this command again with the same arguments."
 
-            if not self.config.continuous_mode and self.next_action_count == 0:
-                # ### GET USER AUTHORIZATION TO EXECUTE COMMAND ###
-                # Get key press: Prompt the user to press enter to continue or escape
-                # to exit
-                self.user_input = ""
-                logger.info(
-                    f"Enter '{self.config.authorise_key}' to authorise command, "
-                    f"'{self.config.authorise_key} -N' to run N continuous commands, "
-                    f"'{self.config.exit_key}' to exit program, or enter feedback for "
-                    f"{self.ai_name}..."
-                )
-                while True:
-                    if self.config.chat_messages_enabled:
-                        console_input = clean_input(
-                            self.config, "Waiting for your response..."
-                        )
-                    else:
-                        console_input = clean_input(
-                            self.config, Fore.MAGENTA + "Input:" + Style.RESET_ALL
-                        )
-                    if console_input.lower().strip() == self.config.authorise_key:
-                        user_input = "GENERATE NEXT COMMAND JSON"
-                        break
-                    elif console_input.lower().strip() == "":
-                        logger.warn("Invalid input format.")
-                        continue
-                    elif console_input.lower().startswith(
-                        f"{self.config.authorise_key} -"
-                    ):
-                        try:
-                            self.next_action_count = abs(
-                                int(console_input.split(" ")[1])
-                            )
-                            user_input = "GENERATE NEXT COMMAND JSON"
-                        except ValueError:
-                            logger.warn(
-                                f"Invalid input format. Please enter '{self.config.authorise_key} -n' "
-                                "where n is the number of continuous tasks."
-