From 0b899eb4cfce5084abec3c25342ddf2e097dc1ac Mon Sep 17 00:00:00 2001 From: Toran Bruce Richards Date: Thu, 6 Apr 2023 13:59:45 +0100 Subject: Initial commit --- .gitignore | 129 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ LICENSE | 21 ++++++++++ README.md | 2 + 3 files changed, 152 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..b6e47617d --- /dev/null +++ b/.gitignore @@ -0,0 +1,129 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..601935b85 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Toran Bruce Richards + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 000000000..0120d4fca --- /dev/null +++ b/README.md @@ -0,0 +1,2 @@ +# Auto-GPT-Benchmarks +A set of standardised benchmarks to assess the performance of Auto-GPTs. -- cgit v1.2.3 From 89081d942c077190d9aa89b0b88cbcc03162da2c Mon Sep 17 00:00:00 2001 From: douglas Date: Mon, 17 Apr 2023 17:22:31 -0400 Subject: First commit for AutoGPT Benchmarks --- .gitmodules | 3 + README.md | 38 ++++++++++ auto_gpt_benchmarking/Auto-GPT | 1 + auto_gpt_benchmarking/AutoGPTAgent.py | 88 ++++++++++++++++++++++ auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml | 8 ++ auto_gpt_benchmarking/CompletionFn.py | 27 +++++++ auto_gpt_benchmarking/LangChainCompletions.py | 34 +++++++++ auto_gpt_benchmarking/__init__.py | 0 .../completion_fns/auto_gpt_completion_fn.yaml | 2 + auto_gpt_benchmarking/main.py | 4 + requirements.txt | 1 + 11 files changed, 206 insertions(+) create mode 100644 .gitmodules create mode 160000 auto_gpt_benchmarking/Auto-GPT create mode 100644 auto_gpt_benchmarking/AutoGPTAgent.py create mode 100644 auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml create mode 100644 auto_gpt_benchmarking/CompletionFn.py create mode 100644 auto_gpt_benchmarking/LangChainCompletions.py create mode 100644 auto_gpt_benchmarking/__init__.py create mode 100644 auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml create mode 100644 auto_gpt_benchmarking/main.py create mode 100644 requirements.txt diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..d293ba9c4 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "Auto-GPT"] + path = auto_gpt_benchmarking/Auto-GPT + url = https://github.com/Significant-Gravitas/Auto-GPT.git diff --git a/README.md b/README.md index 0120d4fca..75db145a2 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,40 @@ # Auto-GPT-Benchmarks A set of standardised benchmarks to assess the performance of Auto-GPTs. + +# What is next? + +- [ ] Build longer form tasks, (code fix backed by testing) +- [ ] Explicitly note the common failure modes in the test harness and fix them. Most of these appear to be failure modes with the core AutoGPT project +- [ ] Switch to a ubuntu container so it can do more things (git, bash, etc) +- [ ] Lower priority, but put this in a webserver backend so we have a good API +- [ ] Get token counting data from the model Add scores to result files based on pricing associated with tokens and models used +- [ ] Think about how this can be applied to other projects besides AutoGPT so we can be THE agent evaluation framework. + + +## Understanding OpenAI Evals + +The Evals docs are here and very good: https://github.com/openai/evals/tree/main/docs + +The basic idea is this though: +1. Use a completion function to point to the language model or in our case AutoGPT, the model you want to test. +2. Register that completion function with the evals framework with a yaml in a `completion_fns` dir. +3. Run the evals against the completion function. + +Then you can make more also, yaml defined evals and run them against the completion function as needed. + +### Completions Functions + +See our yaml file in `completion_fns` dir for the registration of the completion function. 
+See our completion function itself in CompletionFn.py +That points to the AutoGPT model we want to test which is spun up dynamically in a docker container in AutoGPTAgent.py + + +# RANDOM SHIT + +You must add the auto_gpt_bencchmarking dir to the python path +Do this with a path file in your venv. OpenAI evals needs to import it. + +I added a file to `venv/lib/python3.9/site-packages/benchmarking.pth` with the contents: +`/home/douglas/AGI/Auto-GPT-Benchmarks-fork` + + diff --git a/auto_gpt_benchmarking/Auto-GPT b/auto_gpt_benchmarking/Auto-GPT new file mode 160000 index 000000000..97d62cc16 --- /dev/null +++ b/auto_gpt_benchmarking/Auto-GPT @@ -0,0 +1 @@ +Subproject commit 97d62cc16bf45fcd406efeb33d042ebd58c24670 diff --git a/auto_gpt_benchmarking/AutoGPTAgent.py b/auto_gpt_benchmarking/AutoGPTAgent.py new file mode 100644 index 000000000..f24b150b4 --- /dev/null +++ b/auto_gpt_benchmarking/AutoGPTAgent.py @@ -0,0 +1,88 @@ +""" +This instantiates an AutoGPT agent who is capable of handling any task. +It is designed to pass benchmarks as effectively as possible. + +Loads in the ai_settings.yaml file to get the AI's name, role, and goals. +Sets the ai to continuous mode, but kills it if it takes more than 50,000 tokens on any particular evaluation. + +The model is instantiated with a prompt from the AutoGPT completion function. + +Eventualy we will also save and log all of the associated output and thinking for the model as well +""" +from pathlib import Path +import os + + +class AutoGPTAgent: + """ + A class object that contains the configuration information for the AI + The init function takes an evaluation prompt. + It copies the ai_settings.yaml file in AutoGPTData to the Auto-GPT repo. + It then copies the given prompt to a text file to Auto-GPT/auto_gpt_workspace called prompt.txt + It then polls the token usage of the model and for a file called output.txt in the Auto-GPT/auto_gpt_workspace folder. + If the model has used more than 50,000 tokens, it kills the model. + If the model has used less than 50,000 tokens, it returns the output.txt file. + """ + def _clean_up_workspace(self): + """ + Cleans up the workspace by deleting the prompt.txt and output.txt files. + :return: + """ + # check if the files are there and delete them if they are + if self.prompt_file.exists(): + self.prompt_file.unlink() + if self.output_file.exists(): + self.output_file.unlink() + + def _copy_ai_settings(self): + self.ai_settings_dest.write_text(self.ai_settings_file.read_text()) + + def _copy_prompt(self): + self.prompt_file.write_text(self.prompt) + + def _start_agent(self): + """ + This starts the agent in the docker container. + This assumes you have the docker image built with: + docker build -t autogpt . + In the dockerfile in the Auto-GPT repo. + You also must set up the .env file in the Auto-GPT repo. + :return: + """ + env_file = self.auto_gpt_path / ".env" + # run it in continuous mode and skip re-prompts + os.system(f"docker run -it --env-file={env_file} -v {self.auto_workspace}:/home/appuser/auto_gpt_workspace -v {self.auto_gpt_path}/autogpt:/home/appuser/autogpt autogpt --continuous -C '/home/appuser/auto_gpt_workspace/ai_settings.yaml'") + + def _poll_for_output(self): + """ + This polls the output file to see if the model has finished. 
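+        Note: this is a busy-wait with no timeout of its own; the loop blocks
+        until output.txt appears. The token-based kill described in the class
+        docstring is aspirational at this stage (see the token-counting item
+        in the README's to-do list).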
+ :return: + """ + while True: + if self.output_file.exists(): + return self.output_file.read_text() + + def __init__(self, prompt): + self.auto_gpt_path = Path(__file__).parent / "Auto-GPT" + self.auto_workspace = self.auto_gpt_path / "auto_gpt_workspace" + self.prompt_file = self.auto_workspace / "prompt.txt" + self.output_file = self.auto_workspace / "output.txt" + self.ai_settings_file = Path(__file__).parent / "AutoGPTData" / "ai_settings.yaml" + self.ai_settings_dest = self.auto_workspace / "ai_settings.yaml" + self.prompt = prompt + self._clean_up_workspace() + self._copy_ai_settings() + self._copy_prompt() + + def start(self): + self._start_agent() + answer = self._poll_for_output() + print('about to do clean up') + print(answer) + self._clean_up_workspace() + print('did clean up') + return answer + + + + diff --git a/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml b/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml new file mode 100644 index 000000000..b7cc573d5 --- /dev/null +++ b/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml @@ -0,0 +1,8 @@ +ai_goals: +- Evaluate the prompt in `prompt.txt` +- Use all of the tools at your disposal to evaluate the question and find the best answer in the format provided. +- Get the correct answer to the question in the fewest number of steps possible. You are scored first on if you get the correct answer, and second on how many tokens you take to get the right answer so keep your thinking and tool usage as minimal as possible while still ensuring you get the correct answer. +- Save your work in the `output.txt` file, the second you do this, exit the program. +- Exit the program when you are done. +ai_name: EvaluationAgent +ai_role: an ai that is tested on how effectively it can efficiently evaluate questions and answer them correctly while using as few resources as possible diff --git a/auto_gpt_benchmarking/CompletionFn.py b/auto_gpt_benchmarking/CompletionFn.py new file mode 100644 index 000000000..9bb4bb32b --- /dev/null +++ b/auto_gpt_benchmarking/CompletionFn.py @@ -0,0 +1,27 @@ +import importlib +from typing import Optional +from evals.api import CompletionFn, CompletionResult + +from evals.prompt.base import CompletionPrompt +from evals.record import record_sampling +from auto_gpt_benchmarking.AutoGPTAgent import AutoGPTAgent + + +class AutoGPTCompletionResult(CompletionResult): + def __init__(self, response) -> None: + self.response = response + + def get_completions(self) -> list[str]: + return [self.response.strip()] + + +class AutoGPTCompletionFn(CompletionFn): + def __init__(self, **kwargs) -> None: + pass + + def __call__(self, prompt, **kwargs) -> AutoGPTCompletionResult: + prompt = CompletionPrompt(prompt).to_formatted_prompt() + agent = AutoGPTAgent(prompt) + response = agent.start() + record_sampling(prompt=prompt, sampled=response) + return AutoGPTCompletionResult(response) \ No newline at end of file diff --git a/auto_gpt_benchmarking/LangChainCompletions.py b/auto_gpt_benchmarking/LangChainCompletions.py new file mode 100644 index 000000000..17f52bfa1 --- /dev/null +++ b/auto_gpt_benchmarking/LangChainCompletions.py @@ -0,0 +1,34 @@ +import importlib +from typing import Optional +from evals.api import CompletionFn, CompletionResult + +from langchain.llms import BaseLLM + +from evals.prompt.base import CompletionPrompt +from evals.record import record_sampling + + +class LangChainLLMCompletionResult(CompletionResult): + def __init__(self, response) -> None: + self.response = response + + def get_completions(self) -> 
list[str]: + return [self.response.strip()] + + +class LangChainLLMCompletionFn(CompletionFn): + def __init__(self, llm: str, llm_kwargs: Optional[dict] = {}, **kwargs) -> None: + # Import and resolve self.llm to an instance of llm argument here, assuming it's always a subclass of BaseLLM + module = importlib.import_module("langchain.llms") + LLMClass = getattr(module, llm) + + if issubclass(LLMClass, BaseLLM): + self.llm = LLMClass(**llm_kwargs) + else: + raise ValueError(f"{llm} is not a subclass of BaseLLM") + + def __call__(self, prompt, **kwargs) -> LangChainLLMCompletionResult: + prompt = CompletionPrompt(prompt).to_formatted_prompt() + response = self.llm(prompt) + record_sampling(prompt=prompt, sampled=response) + return LangChainLLMCompletionResult(response) diff --git a/auto_gpt_benchmarking/__init__.py b/auto_gpt_benchmarking/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml b/auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml new file mode 100644 index 000000000..d6a55a29b --- /dev/null +++ b/auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml @@ -0,0 +1,2 @@ +auto_gpt_completion_fn: + class: auto_gpt_benchmarking.CompletionFn:AutoGPTCompletionFn \ No newline at end of file diff --git a/auto_gpt_benchmarking/main.py b/auto_gpt_benchmarking/main.py new file mode 100644 index 000000000..f0303f1e7 --- /dev/null +++ b/auto_gpt_benchmarking/main.py @@ -0,0 +1,4 @@ +""" +To run auto-gpt we need to run the following command: + +""" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 000000000..a59bcbdd3 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +evals \ No newline at end of file -- cgit v1.2.3 From 7212c3876d9c23c52893788462ac744e80853329 Mon Sep 17 00:00:00 2001 From: douglas Date: Mon, 17 Apr 2023 17:34:45 -0400 Subject: Cleanup --- README.md | 1 + auto_gpt_benchmarking/main.py | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) delete mode 100644 auto_gpt_benchmarking/main.py diff --git a/README.md b/README.md index 75db145a2..db3c5e3ac 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ A set of standardised benchmarks to assess the performance of Auto-GPTs. - [ ] Lower priority, but put this in a webserver backend so we have a good API - [ ] Get token counting data from the model Add scores to result files based on pricing associated with tokens and models used - [ ] Think about how this can be applied to other projects besides AutoGPT so we can be THE agent evaluation framework. +- [ ] Figure our how the OpenAI Evals results are saved... 
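As a starting point for that last item: the example output later in this README shows evals writing each run's records as JSON Lines under `/tmp/evallogs/`. A minimal sketch for pulling the final accuracy back out of such a file, assuming that format, could look like:

```python
import json
from pathlib import Path
from typing import Optional


def final_accuracy(log_path: Path) -> Optional[float]:
    """Scan an OpenAI Evals run log (JSONL) for its final_report accuracy."""
    for line in log_path.read_text().splitlines():
        if not line.strip():
            continue
        record = json.loads(line)
        if "final_report" in record:
            return record["final_report"].get("accuracy")
    return None  # no final_report line; the run may have been interrupted


# e.g. for the run shown in the example output below:
# final_accuracy(Path("/tmp/evallogs/230417220821DPM75QNS_auto_gpt_completion_fn_test-match.jsonl"))
```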
## Understanding OpenAI Evals diff --git a/auto_gpt_benchmarking/main.py b/auto_gpt_benchmarking/main.py deleted file mode 100644 index f0303f1e7..000000000 --- a/auto_gpt_benchmarking/main.py +++ /dev/null @@ -1,4 +0,0 @@ -""" -To run auto-gpt we need to run the following command: - -""" \ No newline at end of file -- cgit v1.2.3 From 59ff485253225dc7902cc506369ded9457dfed64 Mon Sep 17 00:00:00 2001 From: douglas Date: Mon, 17 Apr 2023 18:14:09 -0400 Subject: Prompt engineering fixes --- README.md | 39 ++++++++++++++++++++-- auto_gpt_benchmarking/AutoGPTAgent.py | 3 ++ auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml | 6 ++-- 3 files changed, 41 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index db3c5e3ac..b8f09a94c 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,8 @@ A set of standardised benchmarks to assess the performance of Auto-GPTs. - [ ] Get token counting data from the model Add scores to result files based on pricing associated with tokens and models used - [ ] Think about how this can be applied to other projects besides AutoGPT so we can be THE agent evaluation framework. - [ ] Figure our how the OpenAI Evals results are saved... +- [ ] Support multi-threaded evals. OpenAI has great support for this. The docker system built here doesn't. +- [ ] Make the file logger/duplicate op checker more robust. It's not great right now. ## Understanding OpenAI Evals @@ -30,12 +32,43 @@ See our completion function itself in CompletionFn.py That points to the AutoGPT model we want to test which is spun up dynamically in a docker container in AutoGPTAgent.py -# RANDOM SHIT +## Setup You must add the auto_gpt_bencchmarking dir to the python path Do this with a path file in your venv. OpenAI evals needs to import it. -I added a file to `venv/lib/python3.9/site-packages/benchmarking.pth` with the contents: -`/home/douglas/AGI/Auto-GPT-Benchmarks-fork` +Create a venv with +`python3.9 -m venv venv` + +Activate it with + +`source venv/bin/activate` + +Add a file to `venv/lib/python3.9/site-packages/benchmarking.pth` with the contents: +`/PATH/TO/REPO/Auto-GPT-Benchmarks-fork` + +This is because evals tries to import it directly. 
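If you would rather not create that file by hand, a throwaway sketch like the following does the same thing. Run it from the repo root with the venv activated; it assumes `site.getsitepackages()[0]` is the venv's site-packages directory:

```python
import site
from pathlib import Path

# Write the .pth file described above so OpenAI evals can import this repo.
repo_root = Path.cwd().resolve()
pth_file = Path(site.getsitepackages()[0]) / "benchmarking.pth"
pth_file.write_text(f"{repo_root}\n")
print(f"wrote {pth_file} -> {repo_root}")
```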
+ +Install the requirements with + +`pip install -r requirements.txt` + +## Running the tests + +EVALS_THREADS=1 EVALS_THREAD_TIMEOUT=600 oaieval auto_gpt_completion_fn test-match --registry_path $PWD/auto_gpt_benchmarking + + +# Example final output: + +~/AGI/Auto-GPT-Benchmarks-fork$ cat /tmp/evallogs/230417220821DPM75QNS_auto_gpt_completion_fn_test-match.jsonl +{"spec": {"completion_fns": ["auto_gpt_completion_fn"], "eval_name": "test-match.s1.simple-v0", "base_eval": "test-match", "split": "s1", "run_config": {"completion_fns": ["auto_gpt_completion_fn"], "eval_spec": {"cls": "evals.elsuite.basic.match:Match", "args": {"samples_jsonl": "test_match/samples.jsonl"}, "key": "test-match.s1.simple-v0", "group": "test-basic"}, "seed": 20220722, "max_samples": null, "command": "/home/douglas/AGI/Auto-GPT-Benchmarks-fork/venv/bin/oaieval auto_gpt_completion_fn test-match --registry_path /home/douglas/AGI/Auto-GPT-Benchmarks-fork/auto_gpt_benchmarking", "initial_settings": {"visible": true}}, "created_by": "", "run_id": "230417220821DPM75QNS", "created_at": "2023-04-17 22:08:21.904498"}} +{"final_report": {"accuracy": 0.3333333333333333}} +{"run_id": "230417220821DPM75QNS", "event_id": 0, "sample_id": "test-match.s1.2", "type": "sampling", "data": {"prompt": "Complete the phrase as concisely as possible.\nUser: OpenAI was founded in 20\nAssistant: ", "sampled": "OpenAI was founded in 2015.2015"}, "created_by": "", "created_at": "2023-04-17 22:10:13.127375+00:00"} +{"run_id": "230417220821DPM75QNS", "event_id": 1, "sample_id": "test-match.s1.2", "type": "match", "data": {"correct": false, "expected": "15", "picked": null, "sampled": "OpenAI was founded in 2015.2015", "options": ["15"]}, "created_by": "", "created_at": "2023-04-17 22:10:13.127550+00:00"} +{"run_id": "230417220821DPM75QNS", "event_id": 2, "sample_id": "test-match.s1.1", "type": "sampling", "data": {"prompt": "Complete the phrase as concisely as possible.\nUser: The first US president was \nAssistant: ", "sampled": "George Washington"}, "created_by": "", "created_at": "2023-04-17 22:11:17.761693+00:00"} +{"run_id": "230417220821DPM75QNS", "event_id": 3, "sample_id": "test-match.s1.1", "type": "match", "data": {"correct": true, "expected": "George Washington", "picked": "George Washington", "sampled": "George Washington", "options": ["George Washington"]}, "created_by": "", "created_at": "2023-04-17 22:11:17.761739+00:00"} +{"run_id": "230417220821DPM75QNS", "event_id": 4, "sample_id": "test-match.s1.0", "type": "sampling", "data": {"prompt": "Complete the phrase as concisely as possible.\nUser: Once upon a \nAssistant: ", "sampled": "Once upon a time"}, "created_by": "", "created_at": "2023-04-17 22:12:04.691026+00:00"} +{"run_id": "230417220821DPM75QNS", "event_id": 5, "sample_id": "test-match.s1.0", "type": "match", "data": {"correct": false, "expected": "time", "picked": null, "sampled": "Once upon a time", "options": ["time"]}, "created_by": "", "created_at": "2023-04-17 22:12:04.691064+00:00"} +(venv) douglas@douglas-XPS-15-9500:~/AGI/Auto-GPT-Benchmarks-fork$ diff --git a/auto_gpt_benchmarking/AutoGPTAgent.py b/auto_gpt_benchmarking/AutoGPTAgent.py index f24b150b4..097311c73 100644 --- a/auto_gpt_benchmarking/AutoGPTAgent.py +++ b/auto_gpt_benchmarking/AutoGPTAgent.py @@ -33,6 +33,8 @@ class AutoGPTAgent: self.prompt_file.unlink() if self.output_file.exists(): self.output_file.unlink() + if self.file_logger.exists(): + self.file_logger.unlink() def _copy_ai_settings(self): 
self.ai_settings_dest.write_text(self.ai_settings_file.read_text()) @@ -67,6 +69,7 @@ class AutoGPTAgent: self.auto_workspace = self.auto_gpt_path / "auto_gpt_workspace" self.prompt_file = self.auto_workspace / "prompt.txt" self.output_file = self.auto_workspace / "output.txt" + self.file_logger = self.auto_workspace / "file_logger.txt" self.ai_settings_file = Path(__file__).parent / "AutoGPTData" / "ai_settings.yaml" self.ai_settings_dest = self.auto_workspace / "ai_settings.yaml" self.prompt = prompt diff --git a/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml b/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml index b7cc573d5..ec995a666 100644 --- a/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml +++ b/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml @@ -1,8 +1,6 @@ ai_goals: -- Evaluate the prompt in `prompt.txt` -- Use all of the tools at your disposal to evaluate the question and find the best answer in the format provided. +- Evaluate the prompt in `prompt.txt` and find the best answer in the format provided. - Get the correct answer to the question in the fewest number of steps possible. You are scored first on if you get the correct answer, and second on how many tokens you take to get the right answer so keep your thinking and tool usage as minimal as possible while still ensuring you get the correct answer. -- Save your work in the `output.txt` file, the second you do this, exit the program. -- Exit the program when you are done. +- Save the final answer and output to the `output.txt` file, the only file you should write to then immediately exit the program. ai_name: EvaluationAgent ai_role: an ai that is tested on how effectively it can efficiently evaluate questions and answer them correctly while using as few resources as possible -- cgit v1.2.3 From 3b0091c2314f61e71246c1609bb1fb0607c85b58 Mon Sep 17 00:00:00 2001 From: Ambuj Pawar Date: Tue, 18 Apr 2023 09:25:25 +0200 Subject: Typo in README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b8f09a94c..871f17b76 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ That points to the AutoGPT model we want to test which is spun up dynamically in ## Setup -You must add the auto_gpt_bencchmarking dir to the python path +You must add the auto_gpt_benchmarking dir to the python path Do this with a path file in your venv. OpenAI evals needs to import it. Create a venv with -- cgit v1.2.3 From 2fbb03dc6c1df3ca1fae2549c3aa9c0a1d86aea6 Mon Sep 17 00:00:00 2001 From: Douglas Schonholtz <15002691+dschonholtz@users.noreply.github.com> Date: Tue, 18 Apr 2023 10:27:47 -0400 Subject: Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 871f17b76..123c87e8b 100644 --- a/README.md +++ b/README.md @@ -18,12 +18,12 @@ A set of standardised benchmarks to assess the performance of Auto-GPTs. The Evals docs are here and very good: https://github.com/openai/evals/tree/main/docs -The basic idea is this though: +The basic idea is this: 1. Use a completion function to point to the language model or in our case AutoGPT, the model you want to test. 2. Register that completion function with the evals framework with a yaml in a `completion_fns` dir. 3. Run the evals against the completion function. -Then you can make more also, yaml defined evals and run them against the completion function as needed. +Then you can make more yaml defined evals and run them against the completion function as needed. 
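To make those three steps concrete, a skeleton completion function following the same pattern as this repo's `CompletionFn.py` might look like the sketch below (the `Echo*` names are illustrative, not part of the repo):

```python
from evals.api import CompletionFn, CompletionResult
from evals.prompt.base import CompletionPrompt
from evals.record import record_sampling


class EchoCompletionResult(CompletionResult):
    def __init__(self, response: str) -> None:
        self.response = response

    def get_completions(self) -> list[str]:
        return [self.response.strip()]


class EchoCompletionFn(CompletionFn):
    """Toy completion function that just parrots the formatted prompt back."""

    def __call__(self, prompt, **kwargs) -> EchoCompletionResult:
        text = CompletionPrompt(prompt).to_formatted_prompt()
        record_sampling(prompt=text, sampled=text)
        return EchoCompletionResult(text)
```

Register it with a two-line yaml in the `completion_fns` dir (as done for `auto_gpt_completion_fn.yaml`) and oaieval can then run any eval against it.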
### Completions Functions @@ -61,7 +61,7 @@ EVALS_THREADS=1 EVALS_THREAD_TIMEOUT=600 oaieval auto_gpt_completion_fn test-mat # Example final output: -~/AGI/Auto-GPT-Benchmarks-fork$ cat /tmp/evallogs/230417220821DPM75QNS_auto_gpt_completion_fn_test-match.jsonl +/Auto-GPT-Benchmarks-fork$ cat /tmp/evallogs/230417220821DPM75QNS_auto_gpt_completion_fn_test-match.jsonl {"spec": {"completion_fns": ["auto_gpt_completion_fn"], "eval_name": "test-match.s1.simple-v0", "base_eval": "test-match", "split": "s1", "run_config": {"completion_fns": ["auto_gpt_completion_fn"], "eval_spec": {"cls": "evals.elsuite.basic.match:Match", "args": {"samples_jsonl": "test_match/samples.jsonl"}, "key": "test-match.s1.simple-v0", "group": "test-basic"}, "seed": 20220722, "max_samples": null, "command": "/home/douglas/AGI/Auto-GPT-Benchmarks-fork/venv/bin/oaieval auto_gpt_completion_fn test-match --registry_path /home/douglas/AGI/Auto-GPT-Benchmarks-fork/auto_gpt_benchmarking", "initial_settings": {"visible": true}}, "created_by": "", "run_id": "230417220821DPM75QNS", "created_at": "2023-04-17 22:08:21.904498"}} {"final_report": {"accuracy": 0.3333333333333333}} {"run_id": "230417220821DPM75QNS", "event_id": 0, "sample_id": "test-match.s1.2", "type": "sampling", "data": {"prompt": "Complete the phrase as concisely as possible.\nUser: OpenAI was founded in 20\nAssistant: ", "sampled": "OpenAI was founded in 2015.2015"}, "created_by": "", "created_at": "2023-04-17 22:10:13.127375+00:00"} -- cgit v1.2.3 From dad4804b4e53f4aab4f2615345d4638719399da1 Mon Sep 17 00:00:00 2001 From: Douglas Schonholtz <15002691+dschonholtz@users.noreply.github.com> Date: Tue, 18 Apr 2023 10:29:05 -0400 Subject: Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 123c87e8b..f3b54648b 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ A set of standardised benchmarks to assess the performance of Auto-GPTs. - [ ] Build longer form tasks, (code fix backed by testing) - [ ] Explicitly note the common failure modes in the test harness and fix them. Most of these appear to be failure modes with the core AutoGPT project - [ ] Switch to a ubuntu container so it can do more things (git, bash, etc) -- [ ] Lower priority, but put this in a webserver backend so we have a good API +- [ ] Lower priority, but put this in a webserver backend so we have a good API rather than doing container and file management for our interface between evals and our agent. - [ ] Get token counting data from the model Add scores to result files based on pricing associated with tokens and models used - [ ] Think about how this can be applied to other projects besides AutoGPT so we can be THE agent evaluation framework. - [ ] Figure our how the OpenAI Evals results are saved... -- cgit v1.2.3 From 486c7e3a5ea1a92472945ae6d42a855bd4191239 Mon Sep 17 00:00:00 2001 From: Douglas Schonholtz <15002691+dschonholtz@users.noreply.github.com> Date: Tue, 18 Apr 2023 11:10:24 -0400 Subject: Update README.md Adding set up info --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index f3b54648b..26aec3d3a 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,14 @@ Install the requirements with `pip install -r requirements.txt` +You must have a docker container built corresponding to the submodule below or the docker run command starting the agent will fail. + +Cd into the AutoGPT submodule and build/tag the dockerfile so the agent can be instantiated. 
+`cd auto_gpt_benchmarks/Auto-GPT` + +Build the container so we can run it procedurally! +`docker build -t autogpt .` + ## Running the tests EVALS_THREADS=1 EVALS_THREAD_TIMEOUT=600 oaieval auto_gpt_completion_fn test-match --registry_path $PWD/auto_gpt_benchmarking -- cgit v1.2.3 From f00ced6612896c0489eb83017777bc3e3652cc33 Mon Sep 17 00:00:00 2001 From: Douglas Schonholtz <15002691+dschonholtz@users.noreply.github.com> Date: Tue, 18 Apr 2023 11:59:42 -0400 Subject: Update README.md --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 26aec3d3a..52eb94454 100644 --- a/README.md +++ b/README.md @@ -9,9 +9,8 @@ A set of standardised benchmarks to assess the performance of Auto-GPTs. - [ ] Lower priority, but put this in a webserver backend so we have a good API rather than doing container and file management for our interface between evals and our agent. - [ ] Get token counting data from the model Add scores to result files based on pricing associated with tokens and models used - [ ] Think about how this can be applied to other projects besides AutoGPT so we can be THE agent evaluation framework. -- [ ] Figure our how the OpenAI Evals results are saved... +- [ ] Copy the OpenAI Eval files from the tmp file they are saved to somewhere we can track the results - [ ] Support multi-threaded evals. OpenAI has great support for this. The docker system built here doesn't. -- [ ] Make the file logger/duplicate op checker more robust. It's not great right now. ## Understanding OpenAI Evals -- cgit v1.2.3 From 625d6e72ecc1ba0336199e4cefbb41d409acf2d1 Mon Sep 17 00:00:00 2001 From: Douglas Schonholtz <15002691+dschonholtz@users.noreply.github.com> Date: Thu, 20 Apr 2023 15:41:29 -0400 Subject: Remove the submodule, reference OpenAI directly rather than running it on the command line, fix logging (#16) * Removed submodule, refactor, docker on pip, async docker logging, running our own tool on CLI rather than OpenAIs --- .gitignore | 2 + .gitmodules | 3 - README.md | 117 +++++++++++------ auto_gpt_benchmarking/Auto-GPT | 1 - auto_gpt_benchmarking/AutoGPTAgent.py | 80 ++++++++++-- auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml | 2 +- auto_gpt_benchmarking/CompletionFn.py | 21 ++- auto_gpt_benchmarking/Evaluator.py | 61 +++++++++ auto_gpt_benchmarking/LangChainCompletions.py | 34 ----- auto_gpt_benchmarking/__main__.py | 144 +++++++++++++++++++++ .../completion_fns/auto_gpt_completion_fn.yaml | 4 +- requirements.txt | 82 +++++++++++- 12 files changed, 452 insertions(+), 99 deletions(-) delete mode 160000 auto_gpt_benchmarking/Auto-GPT create mode 100644 auto_gpt_benchmarking/Evaluator.py delete mode 100644 auto_gpt_benchmarking/LangChainCompletions.py create mode 100644 auto_gpt_benchmarking/__main__.py diff --git a/.gitignore b/.gitignore index b6e47617d..e68877ae9 100644 --- a/.gitignore +++ b/.gitignore @@ -127,3 +127,5 @@ dmypy.json # Pyre type checker .pyre/ + +/data diff --git a/.gitmodules b/.gitmodules index d293ba9c4..e69de29bb 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +0,0 @@ -[submodule "Auto-GPT"] - path = auto_gpt_benchmarking/Auto-GPT - url = https://github.com/Significant-Gravitas/Auto-GPT.git diff --git a/README.md b/README.md index 52eb94454..5a75d5a33 100644 --- a/README.md +++ b/README.md @@ -1,69 +1,97 @@ # Auto-GPT-Benchmarks -A set of standardised benchmarks to assess the performance of Auto-GPTs. +A set of standardised benchmarks to assess the performance of Auto-GPT. 
+This currently uses the OpenAI Evals framework to run the benchmarks. -# What is next? +## Setup -- [ ] Build longer form tasks, (code fix backed by testing) -- [ ] Explicitly note the common failure modes in the test harness and fix them. Most of these appear to be failure modes with the core AutoGPT project -- [ ] Switch to a ubuntu container so it can do more things (git, bash, etc) -- [ ] Lower priority, but put this in a webserver backend so we have a good API rather than doing container and file management for our interface between evals and our agent. -- [ ] Get token counting data from the model Add scores to result files based on pricing associated with tokens and models used -- [ ] Think about how this can be applied to other projects besides AutoGPT so we can be THE agent evaluation framework. -- [ ] Copy the OpenAI Eval files from the tmp file they are saved to somewhere we can track the results -- [ ] Support multi-threaded evals. OpenAI has great support for this. The docker system built here doesn't. +You must add the auto_gpt_benchmarking dir to the python path +Do this with a path file in your venv. OpenAI evals needs to import it. +These instructions currently assume ubuntuy 22.04. +They should be fairly adaptable to the windows/MacOS equivalents. Please submit a PR if you would like to see your OS +documented. -## Understanding OpenAI Evals +Clone the repo with: -The Evals docs are here and very good: https://github.com/openai/evals/tree/main/docs + `git clone git@github.com:Significant-Gravitas/Auto-GPT-Benchmarks.git` + `cd Auto-GPT-Benchmarks` -The basic idea is this: -1. Use a completion function to point to the language model or in our case AutoGPT, the model you want to test. -2. Register that completion function with the evals framework with a yaml in a `completion_fns` dir. -3. Run the evals against the completion function. +Create a venv with -Then you can make more yaml defined evals and run them against the completion function as needed. + `python3.9 -m venv venv` -### Completions Functions -See our yaml file in `completion_fns` dir for the registration of the completion function. -See our completion function itself in CompletionFn.py -That points to the AutoGPT model we want to test which is spun up dynamically in a docker container in AutoGPTAgent.py +Activate it with + `source venv/bin/activate` -## Setup +Install the requirements with: -You must add the auto_gpt_benchmarking dir to the python path -Do this with a path file in your venv. OpenAI evals needs to import it. + `pip install -r requirements.txt` -Create a venv with +If you haven't already clone the AutoGPT repo somewhere else on your machine. +DO NOT CLONE IT INTO A SUBDIR OF THIS REPO. -`python3.9 -m venv venv` + `cd somewhere/else` + `git clone git@github.com:Significant-Gravitas/Auto-GPT.git` -Activate it with +You will need to update the .env file in the Auto-GPT repo to have your OpenAI api key. The file in question is at: + + `Auto-GPT/.env` -`source venv/bin/activate` +Finally, we assume you have a docker container built from the Dockerfile in the Auto-GPT repo. -Add a file to `venv/lib/python3.9/site-packages/benchmarking.pth` with the contents: -`/PATH/TO/REPO/Auto-GPT-Benchmarks-fork` +Build this with: -This is because evals tries to import it directly. 
+ `cd Auto-GPT` + `docker build -t autogpt .` -Install the requirements with +If you want to run with redis as your memory system, you can stand up a redis image in the AutoGPT repo with + + `docker compose up` -`pip install -r requirements.txt` +Then you will need to adjust some variables in your .env file to use the redis memory backend. +See the AutoGPT docs on how to do that. -You must have a docker container built corresponding to the submodule below or the docker run command starting the agent will fail. +Run your first eval with: -Cd into the AutoGPT submodule and build/tag the dockerfile so the agent can be instantiated. -`cd auto_gpt_benchmarks/Auto-GPT` + `cd Auto-GPT-Benchmarks` + `python3 -m auto_gpt_benchmarking test-match --auto-gpt-path /your/path/to/Auto-GPT` -Build the container so we can run it procedurally! -`docker build -t autogpt .` +You should only need to use the --auto-gpt-path flag the first time you run it. Afterwards, that will be saved in -## Running the tests + `auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml`. -EVALS_THREADS=1 EVALS_THREAD_TIMEOUT=600 oaieval auto_gpt_completion_fn test-match --registry_path $PWD/auto_gpt_benchmarking +To see a full list of available flags you can use run `python3 -m auto_gpt_benchmarking --help` +Some of these are inherited from the openAI evals framework and do not work quite as intended as they are not applicable +to this use case. + +This saves a file in `Auto-GPT-Benchmarks/data/records.jsonl` +This file is currently a default that is configurable with --record_path flag. You will have to specify the fully +qualified path. + +## Currently Supported Benchmarks: +From OpenAI Evals +- [x] test-match +- [x] test-fuzzy-match +- [ ] Everything else they have... + +## Understanding OpenAI Evals + +The Evals docs are here and very good: https://github.com/openai/evals/tree/main/docs + +The basic idea is this though: +1. Use a completion function to point to the language model or in our case AutoGPT, the model you want to test. +2. Register that completion function with the evals framework with a yaml in a `completion_fns` dir. +3. Run the evals against the completion function. + +Then you can make more also, yaml defined evals and run them against the completion function as needed. + +### Completions Functions + +See our yaml file in `completion_fns` dir for the registration of the completion function. +See our completion function itself in CompletionFn.py +That points to the AutoGPT model we want to test which is spun up dynamically in a docker container in AutoGPTAgent.py # Example final output: @@ -79,3 +107,12 @@ EVALS_THREADS=1 EVALS_THREAD_TIMEOUT=600 oaieval auto_gpt_completion_fn test-mat {"run_id": "230417220821DPM75QNS", "event_id": 5, "sample_id": "test-match.s1.0", "type": "match", "data": {"correct": false, "expected": "time", "picked": null, "sampled": "Once upon a time", "options": ["time"]}, "created_by": "", "created_at": "2023-04-17 22:12:04.691064+00:00"} (venv) douglas@douglas-XPS-15-9500:~/AGI/Auto-GPT-Benchmarks-fork$ +# What is next? + +- [ ] Run the rest of the OpenAI Evals Especially the modelgraded ones +- [ ] Build longer form tasks, (code fix backed by testing) +- [ ] Explicitly note the common failure modes in the test harness and fix them. 
Most of these appear to be failure modes with the core AutoGPT project +- [ ] Get token counting data from the model Add scores to result files based on pricing associated with tokens and models used +- [ ] Think about how this can be applied to other projects besides AutoGPT so we can be THE agent evaluation framework. +- [ ] Figure our how the OpenAI Evals results are saved... +- [ ] Support multi-threaded evals. OpenAI has great support for this. The docker system built here doesn't. diff --git a/auto_gpt_benchmarking/Auto-GPT b/auto_gpt_benchmarking/Auto-GPT deleted file mode 160000 index 97d62cc16..000000000 --- a/auto_gpt_benchmarking/Auto-GPT +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 97d62cc16bf45fcd406efeb33d042ebd58c24670 diff --git a/auto_gpt_benchmarking/AutoGPTAgent.py b/auto_gpt_benchmarking/AutoGPTAgent.py index 097311c73..63cebf1cb 100644 --- a/auto_gpt_benchmarking/AutoGPTAgent.py +++ b/auto_gpt_benchmarking/AutoGPTAgent.py @@ -10,7 +10,9 @@ The model is instantiated with a prompt from the AutoGPT completion function. Eventualy we will also save and log all of the associated output and thinking for the model as well """ from pathlib import Path -import os +import docker +import asyncio +import aiodocker class AutoGPTAgent: @@ -36,12 +38,34 @@ class AutoGPTAgent: if self.file_logger.exists(): self.file_logger.unlink() - def _copy_ai_settings(self): + def _copy_ai_settings(self) -> None: self.ai_settings_dest.write_text(self.ai_settings_file.read_text()) - def _copy_prompt(self): + def _copy_prompt(self) -> None: self.prompt_file.write_text(self.prompt) + async def _stream_logs(self, container: aiodocker.containers.DockerContainer) -> None: + try: + async for line in container.log(stdout=True, stderr=True, follow=True, tail="all"): + print(line.strip()) + await asyncio.sleep(1) + except aiodocker.exceptions.DockerError as e: + # Handle Docker errors (e.g., container is killed or removed) + print('Docker error: {}'.format(e)) + + async def _run_stream_logs(self) -> None: + """ + This grabs the docker containers id and streams the logs to the console with aiodocker. + :return: None + """ + async with aiodocker.Docker() as docker_client: + try: + container = docker_client.containers.container(self.container.id) + await self._stream_logs(container) + except aiodocker.exceptions.DockerError as e: + # Handle cases when the container is not found + print('Container not found: {}'.format(e)) + def _start_agent(self): """ This starts the agent in the docker container. @@ -51,9 +75,26 @@ class AutoGPTAgent: You also must set up the .env file in the Auto-GPT repo. 
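+        The container is started detached and its stdout/stderr are streamed
+        back asynchronously via aiodocker (see _run_stream_logs above).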
:return: """ + client = docker.from_env() env_file = self.auto_gpt_path / ".env" - # run it in continuous mode and skip re-prompts - os.system(f"docker run -it --env-file={env_file} -v {self.auto_workspace}:/home/appuser/auto_gpt_workspace -v {self.auto_gpt_path}/autogpt:/home/appuser/autogpt autogpt --continuous -C '/home/appuser/auto_gpt_workspace/ai_settings.yaml'") + envs = [ + f"{line.strip()}" for line in open( + env_file + ) if line.strip() != "" and line.strip()[0] != "#" and line.strip()[0] != "\n"] + + self.container = client.containers.run( + image="autogpt", + command="--continuous -C '/home/appuser/auto_gpt_workspace/ai_settings.yaml'", + environment=envs, + volumes={ + self.auto_workspace: {"bind": "/home/appuser/auto_gpt_workspace", "mode": "rw"}, + f"{self.auto_gpt_path}/autogpt": {"bind": "/home/appuser/autogpt", "mode": "rw"}, + }, + stdin_open=True, + tty=True, + detach=True + ) + asyncio.run(self._run_stream_logs()) def _poll_for_output(self): """ @@ -64,8 +105,8 @@ class AutoGPTAgent: if self.output_file.exists(): return self.output_file.read_text() - def __init__(self, prompt): - self.auto_gpt_path = Path(__file__).parent / "Auto-GPT" + def __init__(self, prompt, auto_gpt_path: str): + self.auto_gpt_path = Path(auto_gpt_path) self.auto_workspace = self.auto_gpt_path / "auto_gpt_workspace" self.prompt_file = self.auto_workspace / "prompt.txt" self.output_file = self.auto_workspace / "output.txt" @@ -76,16 +117,33 @@ class AutoGPTAgent: self._clean_up_workspace() self._copy_ai_settings() self._copy_prompt() + self.container = None + self.killing = False + self.logging_task = None def start(self): self._start_agent() answer = self._poll_for_output() - print('about to do clean up') - print(answer) - self._clean_up_workspace() - print('did clean up') + print(f"Prompt was: {self.prompt}, Answer was: {answer}") + self.kill() return answer + def kill(self): + if self.killing: + return + self.killing = True + self._clean_up_workspace() + if self.container: + # kill the container + try: + self.container.kill() + self.container.remove() + except docker.errors.APIError: + print('Couldn\'t find container to kill. Assuming container successfully killed itself.') + if self.logging_task: + self.logging_task.cancel() + self.killing = False + diff --git a/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml b/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml index ec995a666..ab6caaed0 100644 --- a/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml +++ b/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml @@ -1,6 +1,6 @@ ai_goals: - Evaluate the prompt in `prompt.txt` and find the best answer in the format provided. - Get the correct answer to the question in the fewest number of steps possible. You are scored first on if you get the correct answer, and second on how many tokens you take to get the right answer so keep your thinking and tool usage as minimal as possible while still ensuring you get the correct answer. -- Save the final answer and output to the `output.txt` file, the only file you should write to then immediately exit the program. +- Save the final answer and output to the `output.txt` file, the only file you should write to, then immediately exit the program because you are done. 
ai_name: EvaluationAgent ai_role: an ai that is tested on how effectively it can efficiently evaluate questions and answer them correctly while using as few resources as possible diff --git a/auto_gpt_benchmarking/CompletionFn.py b/auto_gpt_benchmarking/CompletionFn.py index 9bb4bb32b..f82ede85c 100644 --- a/auto_gpt_benchmarking/CompletionFn.py +++ b/auto_gpt_benchmarking/CompletionFn.py @@ -1,5 +1,3 @@ -import importlib -from typing import Optional from evals.api import CompletionFn, CompletionResult from evals.prompt.base import CompletionPrompt @@ -16,12 +14,21 @@ class AutoGPTCompletionResult(CompletionResult): class AutoGPTCompletionFn(CompletionFn): - def __init__(self, **kwargs) -> None: - pass + + def __init__(self, auto_gpt_path, **kwargs) -> None: + self.auto_gpt_path = auto_gpt_path + self.agent = None def __call__(self, prompt, **kwargs) -> AutoGPTCompletionResult: prompt = CompletionPrompt(prompt).to_formatted_prompt() - agent = AutoGPTAgent(prompt) - response = agent.start() + self.kill_agent() + self.agent = AutoGPTAgent(prompt, self.auto_gpt_path) + response = self.agent.start() record_sampling(prompt=prompt, sampled=response) - return AutoGPTCompletionResult(response) \ No newline at end of file + return AutoGPTCompletionResult(response) + + def kill_agent(self): + if self.agent: + self.agent.kill() + + diff --git a/auto_gpt_benchmarking/Evaluator.py b/auto_gpt_benchmarking/Evaluator.py new file mode 100644 index 000000000..4301fb3bc --- /dev/null +++ b/auto_gpt_benchmarking/Evaluator.py @@ -0,0 +1,61 @@ +""" +The evaluator class actually executes the evals. +""" +from evals.cli import oaieval +from evals.registry import Registry +from pathlib import Path +from typing import List, Optional, Tuple +import sys + + +class OAIRunArgs: + def __init__( + self, + completion_fn: str, + eval: str, + extra_eval_params: str = "", + max_samples: int = None, + cache: bool = True, + visible: bool = None, + seed: int = 20220722, + user: str = "", + record_path: str = None, + log_to_file: str = None, + debug: bool = False, + local_run: bool = True, + dry_run: bool = False, + dry_run_logging: bool = True, + ): + self.completion_fn = completion_fn + self.eval = eval + self.extra_eval_params = extra_eval_params + self.max_samples = max_samples + self.cache = cache + self.visible = visible + self.seed = seed + self.user = user + self.record_path = record_path + self.log_to_file = log_to_file + self.debug = debug + self.local_run = local_run + self.dry_run = dry_run + self.dry_run_logging = dry_run_logging + # create the record and logging paths if they don't exist + Path(self.record_path).parent.mkdir(parents=True, exist_ok=True) + # Path(self.log_to_file).parent.mkdir(parents=True, exist_ok=True) + # Registry path should be the auto_gpt_benchmarking folder + self.registry_path = None + + +class Evaluator: + def __init__(self, oai_run_args: OAIRunArgs): + self.oai_run_args = oai_run_args + registry_path = Path(__file__).parent + + # add registry path to the python system path + sys.path.append(str(registry_path)) + self.oai_run_args.registry_path = [registry_path] + # self.registry = Registry([registry_path]) + + def run(self): + oaieval.run(self.oai_run_args) diff --git a/auto_gpt_benchmarking/LangChainCompletions.py b/auto_gpt_benchmarking/LangChainCompletions.py deleted file mode 100644 index 17f52bfa1..000000000 --- a/auto_gpt_benchmarking/LangChainCompletions.py +++ /dev/null @@ -1,34 +0,0 @@ -import importlib -from typing import Optional -from evals.api import CompletionFn, 
CompletionResult - -from langchain.llms import BaseLLM - -from evals.prompt.base import CompletionPrompt -from evals.record import record_sampling - - -class LangChainLLMCompletionResult(CompletionResult): - def __init__(self, response) -> None: - self.response = response - - def get_completions(self) -> list[str]: - return [self.response.strip()] - - -class LangChainLLMCompletionFn(CompletionFn): - def __init__(self, llm: str, llm_kwargs: Optional[dict] = {}, **kwargs) -> None: - # Import and resolve self.llm to an instance of llm argument here, assuming it's always a subclass of BaseLLM - module = importlib.import_module("langchain.llms") - LLMClass = getattr(module, llm) - - if issubclass(LLMClass, BaseLLM): - self.llm = LLMClass(**llm_kwargs) - else: - raise ValueError(f"{llm} is not a subclass of BaseLLM") - - def __call__(self, prompt, **kwargs) -> LangChainLLMCompletionResult: - prompt = CompletionPrompt(prompt).to_formatted_prompt() - response = self.llm(prompt) - record_sampling(prompt=prompt, sampled=response) - return LangChainLLMCompletionResult(response) diff --git a/auto_gpt_benchmarking/__main__.py b/auto_gpt_benchmarking/__main__.py new file mode 100644 index 000000000..06f5145ce --- /dev/null +++ b/auto_gpt_benchmarking/__main__.py @@ -0,0 +1,144 @@ +""" +This is the main evaluation file. In it you can specify the following: + +1. The number of threads to use for evaluation. This is set to 1 by default.And will remain that way until we can spin + up containers on command +2. The timeout for each thread. This is set to 60 seconds by default. This is the amount of time each thread will run + for before it is killed when evaluating an agent +3. The path to the AutoGPT code. This is a required parameter as we do not know where your code lives. +4. The evals you would like to run. The options here are any OpenAI eval, or any of the evals defined in this repository + + +What this file does is it parses the params given and then runs the evals with OpenAI's evals framework. +""" + +import argparse +import os +import sys +from pathlib import Path +from datetime import datetime +import yaml + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser() + parser.add_argument("eval", type=str, help="Name of an eval. See registry.") + parser.add_argument( + "--completion-fn", + type=str, + dest="completion_fn", + default="auto_gpt_completion_fn", + help="One or more CompletionFn URLs, separated by commas (,). " + "A CompletionFn can either be the name of a model available in the OpenAI API or a key in the registry " + "(see evals/registry/completion_fns).", + ) + parser.add_argument( + "--timeout", + type=int, + default=300, + help="The timeout for each thread", + ) + parser.add_argument( + "--auto-gpt-path", + type=str, + default=None, + help="The path to the AutoGPT code. This updates auto_gpt_competion_fn.yaml in completion fns. 
" + "So you only need to set this once.", + ) + parser.add_argument("--extra_eval_params", type=str, default="") + parser.add_argument("--max_samples", type=int, default=None) + parser.add_argument("--cache", action=argparse.BooleanOptionalAction, default=True) + parser.add_argument("--visible", action=argparse.BooleanOptionalAction, default=None) + parser.add_argument("--seed", type=int, default=20220722) + parser.add_argument("--user", type=str, default="") + parser.add_argument("--record_path", type=str, default=str(Path(__file__).parent.parent / "data" / "records.jsonl")) + parser.add_argument( + "--log_to_file", type=str, default=None,#default=str( + # Path(__file__).parent.parent / "data" / "log" / "log.txt" + # ), help="Log to a file instead of stdout" + ) + parser.add_argument("--debug", action=argparse.BooleanOptionalAction, default=False) + parser.add_argument("--local-run", action=argparse.BooleanOptionalAction, default=True) + parser.add_argument("--dry-run", action=argparse.BooleanOptionalAction, default=False) + parser.add_argument("--dry-run-logging", action=argparse.BooleanOptionalAction, default=True) + return parser.parse_args() + + +def update_yaml_with_auto_gpt_path(yaml_path: str, auto_gpt_path: str or None) -> Path: + """ + If there is a given auto_gpt_path, then we need to update the yaml file to include it in the system path + If we don't have one. Then we get the path from the yaml. + If none exists in the yaml and we don't have a path then we raise an exception. + :param yaml_path: The path to the yaml file + :param auto_gpt_path: The path to the AutoGPT code + :return: The path to the AutoGPT code + """ + with open(yaml_path, "r") as f: + yaml_data = yaml.safe_load(f) + if yaml_data["auto_gpt_completion_fn"]["args"]["auto_gpt_path"] is None and auto_gpt_path is None: + raise Exception("You must specify a auto_gpt_path in the yaml file or pass it in as a parameter") + if auto_gpt_path is None: + auto_gpt_path = yaml_data["auto_gpt_completion_fn"]["args"]["auto_gpt_path"] + if auto_gpt_path is not None: + yaml_data["auto_gpt_completion_fn"]["args"]["auto_gpt_path"] = auto_gpt_path + with open(yaml_path, "w") as f: + yaml.safe_dump(yaml_data, f) + + return Path(auto_gpt_path).absolute() + + +def load_env_file(env_path: Path): + if not env_path.exists(): + raise FileNotFoundError('You must set the OpenAI key in the AutoGPT env file. ' + 'We need your api keys to start the AutoGPT agent and use OpenAI evals') + with open(env_path, "r") as f: + # find the OPENAI_API_KEY key split it from the equals sign and assign it so OpenAI evals can use it. 
+ for line in f.readlines(): + if line.startswith("OPENAI_API_KEY"): + os.environ["OPENAI_API_KEY"] = line.split("=")[1].strip() + break + + +if __name__ == "__main__": + args = parse_args() + # do not run in multiprocessing mode We do not use this right now, as it disables OpenAI's timeouts :( + # os.environ["EVALS_SEQUENTIAL"] = "1" + os.environ["EVALS_THREAD_TIMEOUT"] = str(args.timeout) + os.environ["EVALS_THREADS"] = str(1) + + # Update the yaml file with the auto_gpt_path + autogpt_path = update_yaml_with_auto_gpt_path( + str(Path(__file__).parent / "completion_fns" / "auto_gpt_completion_fn.yaml"), + args.auto_gpt_path + ) + + # Add the benchmarks path to the system path so we can import auto_gpt_benchmarking + sys.path.append(str(Path(__file__).parent.parent.absolute())) + + # load all of the environment variables in the auto-gpt path/.env file + load_env_file(Path(autogpt_path) / ".env") + + # Obviously, a top level import would be better. This allows us to set the API key with the env file, as it gets + # set in the evaluator. We can't set it before the import because the import will fail without an API key. + from auto_gpt_benchmarking.Evaluator import Evaluator, OAIRunArgs + run_args = OAIRunArgs( + completion_fn=args.completion_fn, + eval=args.eval, + extra_eval_params=args.extra_eval_params, + max_samples=args.max_samples, + cache=args.cache, + visible=args.visible, + seed=args.seed, + user=args.user, + record_path=args.record_path, + log_to_file=args.log_to_file, + debug=args.debug, + local_run=args.local_run, + dry_run=args.dry_run, + dry_run_logging=args.dry_run_logging) + + # Run the evals + evaluator = Evaluator( + run_args + ) + evaluator.run() diff --git a/auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml b/auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml index d6a55a29b..a101f000a 100644 --- a/auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml +++ b/auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml @@ -1,2 +1,4 @@ auto_gpt_completion_fn: - class: auto_gpt_benchmarking.CompletionFn:AutoGPTCompletionFn \ No newline at end of file + args: + auto_gpt_path: + class: auto_gpt_benchmarking.CompletionFn:AutoGPTCompletionFn diff --git a/requirements.txt b/requirements.txt index a59bcbdd3..b1c5914ad 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,81 @@ -evals \ No newline at end of file +aiodocker==0.21.0 +aiohttp==3.8.4 +aiosignal==1.3.1 +asn1crypto==1.5.1 +async-timeout==4.0.2 +attrs==23.1.0 +backoff==2.2.1 +blobfile==2.0.1 +cachetools==5.3.0 +certifi==2022.12.7 +cffi==1.15.1 +charset-normalizer==2.1.1 +click==8.1.3 +colorama==0.4.6 +contourpy==1.0.7 +cryptography==40.0.2 +cycler==0.11.0 +dataclasses-json==0.5.7 +docker==6.0.1 +evals==1.0.2.post1 +filelock==3.11.0 +fire==0.5.0 +fonttools==4.39.3 +frozenlist==1.3.3 +gptcache==0.1.13 +greenlet==2.0.2 +idna==3.4 +importlib-resources==5.12.0 +joblib==1.2.0 +kiwisolver==1.4.4 +langchain==0.0.142 +langdetect==1.0.9 +lxml==4.9.2 +lz4==4.3.2 +marshmallow==3.19.0 +marshmallow-enum==1.5.1 +matplotlib==3.7.1 +mock==5.0.2 +multidict==6.0.4 +mypy==1.2.0 +mypy-extensions==1.0.0 +nltk==3.8.1 +numexpr==2.8.4 +numpy==1.24.2 +openai==0.27.4 +openapi-schema-pydantic==1.2.4 +oscrypto==1.3.0 +packaging==23.1 +pandas==1.5.3 +Pillow==9.5.0 +portalocker==2.7.0 +pyarrow==10.0.1 +pycparser==2.21 +pycryptodomex==3.17 +pydantic==1.10.7 +PyJWT==2.6.0 +pyOpenSSL==23.1.1 +pyparsing==3.0.9 +python-dateutil==2.8.2 +pytz==2023.3 +PyYAML==6.0 +pyzstd==0.15.6 +regex==2023.3.23 
+requests==2.28.2 +sacrebleu==2.3.1 +setuptools-scm==7.1.0 +six==1.16.0 +snowflake-connector-python==3.0.2 +SQLAlchemy==1.4.47 +tabulate==0.9.0 +tenacity==8.2.2 +termcolor==2.2.0 +tiktoken==0.3.3 +tomli==2.0.1 +tqdm==4.65.0 +typing-inspect==0.8.0 +typing_extensions==4.5.0 +urllib3==1.26.15 +websocket-client==1.5.1 +yarl==1.8.2 +zipp==3.15.0 -- cgit v1.2.3 From 011ed2f2b97840921539dc385891ebf9f7701e78 Mon Sep 17 00:00:00 2001 From: Douglas Schonholtz <15002691+dschonholtz@users.noreply.github.com> Date: Thu, 20 Apr 2023 15:47:15 -0400 Subject: Update README.md (#17) remove -m --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5a75d5a33..fe8bb6b04 100644 --- a/README.md +++ b/README.md @@ -56,7 +56,7 @@ See the AutoGPT docs on how to do that. Run your first eval with: `cd Auto-GPT-Benchmarks` - `python3 -m auto_gpt_benchmarking test-match --auto-gpt-path /your/path/to/Auto-GPT` + `python3 auto_gpt_benchmarking test-match --auto-gpt-path /your/path/to/Auto-GPT` You should only need to use the --auto-gpt-path flag the first time you run it. Afterwards, that will be saved in -- cgit v1.2.3 From ef5c4f8a11b23667860acf0e6689ec195d056bd2 Mon Sep 17 00:00:00 2001 From: Media <12145726+rihp@users.noreply.github.com> Date: Fri, 21 Apr 2023 01:04:34 +0200 Subject: Graphs for evals (#20) * Update README.md * Jupyter Notebook for evaluating eval results --------- Co-authored-by: Douglas Schonholtz <15002691+dschonholtz@users.noreply.github.com> --- auto_gpt_benchmarking/__main__.py | 38 ++++--- evals_analytics.ipynb | 220 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 245 insertions(+), 13 deletions(-) create mode 100644 evals_analytics.ipynb diff --git a/auto_gpt_benchmarking/__main__.py b/auto_gpt_benchmarking/__main__.py index 06f5145ce..c42c73b8e 100644 --- a/auto_gpt_benchmarking/__main__.py +++ b/auto_gpt_benchmarking/__main__.py @@ -18,11 +18,14 @@ import sys from pathlib import Path from datetime import datetime import yaml +from datetime import datetime + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser() - parser.add_argument("eval", type=str, help="Name of an eval. See registry.") + parser.add_argument( + "eval", type=str, help="Name of an eval. 
See registry.") parser.add_argument( "--completion-fn", type=str, @@ -47,20 +50,27 @@ def parse_args() -> argparse.Namespace: ) parser.add_argument("--extra_eval_params", type=str, default="") parser.add_argument("--max_samples", type=int, default=None) - parser.add_argument("--cache", action=argparse.BooleanOptionalAction, default=True) - parser.add_argument("--visible", action=argparse.BooleanOptionalAction, default=None) + parser.add_argument( + "--cache", action=argparse.BooleanOptionalAction, default=True) + parser.add_argument( + "--visible", action=argparse.BooleanOptionalAction, default=None) parser.add_argument("--seed", type=int, default=20220722) parser.add_argument("--user", type=str, default="") - parser.add_argument("--record_path", type=str, default=str(Path(__file__).parent.parent / "data" / "records.jsonl")) + parser.add_argument("--record_path", type=str, default=str(Path( + __file__).parent.parent / "data" / f"eval-{datetime.now().strftime('%Y%m%d-%H%M%S')}.jsonl")) parser.add_argument( - "--log_to_file", type=str, default=None,#default=str( - # Path(__file__).parent.parent / "data" / "log" / "log.txt" - # ), help="Log to a file instead of stdout" + "--log_to_file", type=str, default=None, # default=str( + # Path(__file__).parent.parent / "data" / "log" / "log.txt" + # ), help="Log to a file instead of stdout" ) - parser.add_argument("--debug", action=argparse.BooleanOptionalAction, default=False) - parser.add_argument("--local-run", action=argparse.BooleanOptionalAction, default=True) - parser.add_argument("--dry-run", action=argparse.BooleanOptionalAction, default=False) - parser.add_argument("--dry-run-logging", action=argparse.BooleanOptionalAction, default=True) + parser.add_argument( + "--debug", action=argparse.BooleanOptionalAction, default=False) + parser.add_argument( + "--local-run", action=argparse.BooleanOptionalAction, default=True) + parser.add_argument( + "--dry-run", action=argparse.BooleanOptionalAction, default=False) + parser.add_argument("--dry-run-logging", + action=argparse.BooleanOptionalAction, default=True) return parser.parse_args() @@ -76,7 +86,8 @@ def update_yaml_with_auto_gpt_path(yaml_path: str, auto_gpt_path: str or None) - with open(yaml_path, "r") as f: yaml_data = yaml.safe_load(f) if yaml_data["auto_gpt_completion_fn"]["args"]["auto_gpt_path"] is None and auto_gpt_path is None: - raise Exception("You must specify a auto_gpt_path in the yaml file or pass it in as a parameter") + raise Exception( + "You must specify a auto_gpt_path in the yaml file or pass it in as a parameter") if auto_gpt_path is None: auto_gpt_path = yaml_data["auto_gpt_completion_fn"]["args"]["auto_gpt_path"] if auto_gpt_path is not None: @@ -108,7 +119,8 @@ if __name__ == "__main__": # Update the yaml file with the auto_gpt_path autogpt_path = update_yaml_with_auto_gpt_path( - str(Path(__file__).parent / "completion_fns" / "auto_gpt_completion_fn.yaml"), + str(Path(__file__).parent / "completion_fns" / + "auto_gpt_completion_fn.yaml"), args.auto_gpt_path ) diff --git a/evals_analytics.ipynb b/evals_analytics.ipynb new file mode 100644 index 000000000..f1b48424c --- /dev/null +++ b/evals_analytics.ipynb @@ -0,0 +1,220 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYgAAAEWCAYAAAB8LwAVAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAc5klEQVR4nO3deZgddZ3v8feHALLKYoLGrCiIoBDABlRQQBEiKsHHLVEUvGDmOuK43RnBmQcYnHH0ehVHwYHoRERlEREmwwQRRhEVkTSILEEkRiAJYBrCpiAY+Nw/6tfDSVOdPunu6pNOf17Pc56u+v1q+Vaf5Hy6llMl20RERPS1UacLiIiI9VMCIiIiaiUgIiKiVgIiIiJqJSAiIqJWAiIiImolICI6TNJrJN3e6Toi+kpAREdJukrSg5Ke0+lamiDpIEnLa9qvknQcgO2f2t6ljWWdIunbTdQZUScBER0jaTrwGsDAESO87o1Hcn3ru/w+ok4CIjrpfcC1wNnA0a0dkqZI+r6kHkkPSDq9pe8Dkm6T9KikxZL2Lu2WtFPLdGdL+qcyfJCk5ZI+Kek+4BuStpN0aVnHg2V4csv820v6hqR7Sv8lpf0WSW9pmW4TSfdL2mswv4S+exmlxhVl+26X9HpJM4FPAe+S9EdJvy7TvlDSAkmrJC2R9IGW5Wwu6Zul9tsk/V2f9dxZ1nUT8CdJG0s6QdLvWn63b22Z/hhJP5d0mqSHJC2V9OrSvkzSSklrvI8xuiUgopPeB3ynvA6T9HwASeOAS4G7gOnAJOD80vcO4JQy73Op9jweaHN9LwC2B6YBc6n+/X+jjE8FHgdOb5n+W8AWwMuAHYDTSvs5wFEt0x0O3Gv7V23W0S9JuwDHA/vY3ho4DLjT9g+AzwAX2N7K9owyy/nAcuCFwNuBz0h6Xek7mer39yLgDX1q7jUHeBOwre3VwO+o9uq2Af4R+LakiS3T7wfcBDwPOLesfx9gp7L80yVtNdTfQ6wnbOeV14i/gAOAvwDjy/hvgI+V4VcBPcDGNfNdDnykn2Ua2Kll/Gzgn8rwQcCTwGZrqWlP4MEyPBF4GtiuZroXAo8Czy3j3wP+rp9lHlSW81Cf12rguJZplpfhnYCVwCHAJn2WdQrw7ZbxKcBTwNYtbf8CnF2GlwKHtfQd17ueMn4n8L8GeJ9uBGaV4WOAO1r6di+/8+e3tD0A7Nnpf195Dc8rexDRKUcDP7R9fxk/l2cOM00B7nL1F21fU6j+yh2MHtt/7h2RtIWksyTdJekR4Gpg27IHMwVYZfvBvguxfQ/wc+BtkrYF3ki1F9Sfe2xv2/oCflY3oe0lwEepwmClpPMlvbCf5b6w1PhoS9tdVHtcvf3LWvpah2vbJL1P0o3lENJDwMuB8S2T/KFl+PFSc9+27EFsIBIQMeIkbQ68EzhQ0n3lnMDHgBmSZlB9aE3t58TpMuDF/Sz6MapDQr1e0Ke/762LPwHsAuxn+7nAa3tLLOvZvgRAnW9SHVJ5B/AL2yv6mW6d2T7X9gFUh74MfK6f+u8pNW7d0jYV6K3lXmByS9+UutX1DkiaBnyN6hDX80qQ3UL1+4gxKAERnXAk1aGR3agO6+wJ7Ar8lOrcwnVUH26flbSlpM0k7V/m/TrwfyS9QpWdygcbVIdD3i1pXDmpe+AAdWxN9RfvQ5K2pzpmD4Dte4HLgK+Wk9mbSHpty7yXAHsDH6E6JzEsJO0i6XWqLvv9c6nv6dL9B2C6pI1KjcuAa4B/Kb+jPYBjgd5LYb8LnFjqn0T1wb82W1IFRk+p5f1UexAxRiUgohOOBr5h+27b9/W+qE4Qv4fqL9a3UB2Pv5vqJOy7AGxfCPwz1SGpR6k+qLcvy/1Ime+hspxLBqjjS8DmwP1UV1P9oE//e6nOk/yG6rzAR3s7bD8OXATsCHy/7S0f2HOAz5aa7qM6OX5i6buw/HxA0g1leA7Vieh7gIuBk21fWfpOpfrd/R64kupcyRP9rdj2YuALwC+owmh3qkNpMUbJzgODIgZD0knAS2zXXR203pH0QWC27YH2rCKA7EFEDEo5JHUsMK/TtfRH0kRJ+0vaqFw++wmqvYyItiQgItZR+TLaMuAy21d3up612BQ4i+pQ3I+A/wC+2tGKYlTJIaaIiKiVPYiIiKi1Qd2ga/z48Z4+fXqny4iIGDWuv/76+21PqOvboAJi+vTpdHd3d7qMiIhRQ9Jd/fXlEFNERNRKQERERK0ERERE1EpARERErQRERETUSkBEREStxgJC1TOFf1yea3urpI/UTCNJXy7P0r1J5dnCpe9oSXeUV55zGxExwpr8HsRq4BO2bygPNLle0hXllsK93gjsXF77Af8G7Ndyb/4uqvvTXy9pQd3TvSIiohmN7UHYvtf2DWX4UeA2nnkUYq9ZwDmuXEv1uMeJVA9qv8J27yMfrwBmNlVrREQ824icg5A0HdgL+GWfrkms+Uzc5aWtv/a6Zc+V1C2pu6enZ9hqjohYn02cPBVJSGLi5KmNrKPxgJC0FdWTtz5q+5HhXr7teba7bHdNmFB7O5GIiA3OfSuWMe2TlzLtk5dy34plA88wCI0GhKRNqMLhO7brHsu4gjUfpD65tPXXHhERI6TJq5gE/Dtwm+0v9jPZAuB95WqmVwIPl4fFXw4cWh62vh1waGmLiIgR0uRVTPtTPfT9Zkk3lrZPAVMBbJ8JLAQOB5YAjwHvL32rJH0aWFTmO9X2qgZrjYiIPhoLCNs/AzTANAY+1E/ffGB+A6VFREQb8k3qiIiolYCIiIhaCYiIiKiVgIiIiFoJiIiIqJWAiIiIWgmIiIiolYCIiIhaCYiIiKiVgIiIiFoJiIiIqJWAiIiIWgmIiIiolYCIiIhaCYiIiKiVgIiIiFqNPTBI0nzgzcBK2y+v6f9b4D0tdewKTChPk7sTeBR4Clhtu6upOiMiol6TexBnAzP767T9edt72t4TOBH4SZ/Hih5c+hMOEREd0FhA2L4aaPc50nOA85qqJSIi1l3Hz0FI2oJqT+OilmYDP5R0vaS5naksImJsa+wcxDp4C/DzPoeXDrC9QtIOwBWSflP2SJ6lBMhcgKlTpzZfbUTEGNHxPQhgNn0OL9leUX6uBC4G9u1vZtvzbHfZ7powYUKjhUZEjCUdDQhJ2wAHAv/R0ralpK17h4FDgVs6U2FExNjV5GWu5wEHAeMlLQdOBjYBsH1mmeytwA9t/6ll1ucDF0vqre9c2z9oqs6IiKjXWEDYntPGNGdTXQ7b2rYUmNFMVRER0a714RxERESshxIQERFRKwERERG1EhAREVErAREREbUSEBERUSsBERERtRIQERFRKwERERG1EhAREVErAREREbUSEBERUSsBERERtRIQERFRKwERERG1EhAREVErAREREbUaCwhJ8yWtlFT7PGlJB0l6WNKN5XVSS99MSbdLWiLphKZqjIiI/jW5B3E2MHOAaX5qe8/yOhVA0jjgDOCNwG7AHEm7NVhnRETUaCwgbF8NrBrErPsCS2wvtf0kcD4wa1iLi4iIAXX6HMSrJP1a0mWSXl
baJgHLWqZZXtpqSZorqVtSd09PT5O1RkSMKZ0MiBuAabZnAF8BLhnMQmzPs91lu2vChAnDWV9ExJjWsYCw/YjtP5bhhcAmksYDK4ApLZNOLm0RETGCOhYQkl4gSWV431LLA8AiYGdJO0raFJgNLOhUnRERY9XGTS1Y0nnAQcB4ScuBk4FNAGyfCbwd+KCk1cDjwGzbBlZLOh64HBgHzLd9a1N1RkREvcYCwvacAfpPB07vp28hsLCJuiIioj2dvoopIiLWUwmIiIiolYCIiIhaCYiIiKiVgIiIiFoJiIiIqJWAiIiIWgmIiIiolYCIiIhaCYiIiKiVgIiIiFoJiIiIqJWAiIiIWgmIiIiolYCIiIhaCYiIiKiVgIiIiFqNBYSk+ZJWSrqln/73SLpJ0s2SrpE0o6XvztJ+o6TupmqMiIj+tRUQknYfxLLPBmaupf/3wIG2dwc+Dczr03+w7T1tdw1i3RERMUTt7kF8VdJ1kv5a0jbtzGD7amDVWvqvsf1gGb0WmNxmLRERMQLaCgjbrwHeA0wBrpd0rqQ3DGMdxwKXta4S+KGk6yXNXduMkuZK6pbU3dPTM4wlRUSMbRu3O6HtOyT9A9ANfBnYS5KAT9n+/mALkHQwVUAc0NJ8gO0VknYArpD0m7JHUlfXPMrhqa6uLg+2joiIWFO75yD2kHQacBvwOuAttnctw6cNduWS9gC+Dsyy/UBvu+0V5edK4GJg38GuIyIiBqfdcxBfAW4AZtj+kO0bAGzfA/zDYFYsaSrwfeC9tn/b0r6lpK17h4FDgdoroSIiojntHmJ6E/C47acAJG0EbGb7MdvfqptB0nnAQcB4ScuBk4FNAGyfCZwEPI/qBDjA6nLF0vOBi0vbxsC5tn8wuM2LiIjBajcgrgQOAf5YxrcAfgi8ur8ZbM9Z2wJtHwccV9O+FJjx7DkiImIktXuIaTPbveFAGd6imZIiImJ90G5A/EnS3r0jkl4BPN5MSRERsT5o9xDTR4ELJd0DCHgB8K6mioqIiM5rKyBsL5L0UmCX0nS77b80V1ZERHRa21+UA/YBppd59paE7XMaqSoiIjqurYCQ9C3gxcCNwFOl2UACIiJiA9XuHkQXsJvt3MoiImKMaPcqpluoTkxHRMQY0e4exHhgsaTrgCd6G20f0UhVERHRce0GxClNFhEREeufdi9z/YmkacDOtq+UtAUwrtnSIiKik9q93fcHgO8BZ5WmScAlDdUUERHrgXZPUn8I2B94BKqHBwE7NFVURER0XrsB8YTtJ3tHJG1M9T2IiIjYQLUbED+R9Clg8/Is6guB/2yurIiI6LR2A+IEoAe4GfgrYCGDfJJcRESMDu1exfQ08LXyioiIMaDdq5h+L2lp31cb882XtFJS7TOlVfmypCWSburzzImjJd1RXke3v0kRETEc1uVeTL02A94BbN/GfGcDp9P/Tf3eCOxcXvsB/wbsJ2l7qmdYd1GdDL9e0gLbD7ZZb0REDFFbexC2H2h5rbD9JeBNbcx3NbBqLZPMAs5x5VpgW0kTgcOAK2yvKqFwBTCznVojImJ4tHuIae+WV5ek/826PUuiP5OAZS3jy0tbf+11tc2V1C2pu6enZxhKinZMnDwVSUhi4uSpo275se7ynow97X7If6FleDVwJ/DOYa9mEGzPA+YBdHV15bsZI+S+FcuY9slLAbjrc28edcuPdZf3ZOxp9yqmgxta/wpgSsv45NK2AjioT/tVDdUQERE12n2i3MfX1m/7i4Nc/wLgeEnnU52kftj2vZIuBz4jabsy3aHAiYNcR0REDMK6XMW0D9UHOsBbgOuAO9Y2k6TzqPYExktaTnVl0iYAts+k+sLd4cAS4DHg/aVvlaRPA4vKok61vbaT3RERMczaDYjJwN62HwWQdArwX7aPWttMtucM0G+qGwHW9c0H5rdZX0REDLN2b7XxfODJlvEnS1tERGyg2t2DOAe4TtLFZfxI4JuNVBQREeuFdq9i+mdJlwGvKU3vt/2r5sqKiIhOa/cQE8AWwCO2/xVYLmnHhmqKiIj1QLvfpD4Z+CTPXGq6CfDtpoqKiIjOa3cP4q3AEcCfAGzfA2zdVFEREdF57QbEk+WSVANI2rK5kiIiYn3QbkB8V9JZVHdb/QBwJXl4UETEBm3Aq5gkCbgAeCnwCLALcJLtKxquLSIiOmjAgLBtSQtt7071XIaIiBgD2j3EdIOkfRqtJCIi1ivtfpN6P+AoSXdSXckkqp2LPZoqLCIiOmutASFpqu27qR4BGhERY8hAexCXUN3F9S5JF9l+2wjUFBER64GBzkGoZfhFTRYSERHrl4ECwv0MR0TEBm6gQ0wzJD1CtSexeRmGZ05SP7fR6iIiomPWGhC2xw1l4ZJmAv8KjAO+bvuzffpPAw4uo1sAO9jetvQ9Bdxc+u62fcRQaomIiHXT7mWu60zSOOAM4A3AcmCRpAW2F/dOY/tjLdN/GNirZRGP296zqfoiImLt1uV5EOtqX2CJ7aW2nwTOB2atZfo5wHkN1hMREeugyYCYBCxrGV9e2p5F0jRgR+BHLc2bSeqWdK2kI/tbiaS5Zbrunp6eYSg7IiKg2YBYF7OB79l+qqVtmu0u4N3AlyS9uG5G2/Nsd9numjBhwkjUGhExJjQZECuAKS3jk0tbndn0Obxke0X5uRS4ijXPT0RERMOaDIhFwM6SdpS0KVUILOg7kaSXAtsBv2hp207Sc8rweGB/YHHfeSMiojmNXcVke7Wk44HLqS5znW/7VkmnAt22e8NiNnB+eWJdr12BsyQ9TRVin229+ikiIprXWEAA2F4ILOzTdlKf8VNq5rsG2L3J2iIiYu3Wl5PUERGxnklARERErQRERETUSkBEREStBERERNRKQERERK0ERERE1EpARERErQRERETUSkBEREStBERERNRKQERERK0ERERE1EpARERErQRERETUSkBEREStBERERNRqNCAkzZR0u6Qlkk6o6T9GUo+kG8vruJa+oyXdUV5HN1lnREQ8W2OPHJU0DjgDeAOwHFgkaUHNs6UvsH18n3m3B04GugAD15d5H2yq3oiIWFOTexD7AktsL7X9JHA+MKvNeQ8DrrC9qoTCFcDMhuqMiIgaTQbEJGBZy/jy0tbX2yTdJOl7kqas47xImiupW1J3T0/PcNQdERF0/iT1fwLTbe9BtZfwzXVdgO15trtsd02YMGHYC4yIGKuaDIgVwJSW8cml7X/YfsD2E2X068Ar2p03IiKa1WRALAJ2lrSjpE2B2cCC1gkkTWwZPQK4rQxfDhwqaTtJ2wGHlraIiBghjV3FZHu1pOOpPtjHAfNt3yrpVKDb9gLgbyQdAawGVgHHlHlXSfo0VcgAnGp7VVO1RkTEszUWEAC2FwIL+7Sd1DJ8InBiP/POB+Y3WV9ERPSv0yepIyJiPZWAiIiIWgmIiIiolYCIiIhaCYiIiKiVgIiIiFoJiIiIqJWAiIiIWgmIiIiolYCIiIhaCYiIiKiVgIiIiFoJiIiIqJWAiIiIWgmIiIiolYCIiIhaCYiIiKjVaEBIminpd
klLJJ1Q0/9xSYsl3STpvyVNa+l7StKN5bWg77wREdGsxh45KmkccAbwBmA5sEjSAtuLWyb7FdBl+zFJHwT+L/Cu0ve47T2bqi8iItauyT2IfYEltpfafhI4H5jVOoHtH9t+rIxeC0xusJ6IiFgHTQbEJGBZy/jy0tafY4HLWsY3k9Qt6VpJR/Y3k6S5Zbrunp6eIRUcERHPaOwQ07qQdBTQBRzY0jzN9gpJLwJ+JOlm27/rO6/tecA8gK6uLo9IwRERY0CTexArgCkt45NL2xokHQL8PXCE7Sd6222vKD+XAlcBezVYa0RE9NFkQCwCdpa0o6RNgdnAGlcjSdoLOIsqHFa2tG8n6TlleDywP9B6cjsiIhrW2CEm26slHQ9cDowD5tu+VdKpQLftBcDnga2ACyUB3G37CGBX4CxJT1OF2Gf7XP0UERENa/QchO2FwMI+bSe1DB/Sz3zXALs3WVtERKxdvkkdERG1EhAREVErAREREbUSEBERUSsBERERtRIQERFRKwERERG1EhAREVErAREREbUSEBERUSsBERERtRIQERFRKwERERG1EhAREVErAREREbUSEBERUSsBERERtRoNCEkzJd0uaYmkE2r6nyPpgtL/S0nTW/pOLO23SzqsyTojIuLZGgsISeOAM4A3ArsBcyTt1meyY4EHbe8EnAZ8rsy7GzAbeBkwE/hqWV5ERIyQJvcg9gWW2F5q+0ngfGBWn2lmAd8sw98DXi9Jpf1820/Y/j2wpCwvIiJGyMYNLnsSsKxlfDmwX3/T2F4t6WHgeaX92j7zTqpbiaS5wNwy+kdJtw+y3vHA/YOcd7Qa0jbf9bk3/89wlevDq6Hlj7X3eVi3t+n3fJiMmfe45f0YL2mw2zytv44mA2JE2J4HzBvqciR12+4ahpJGjWzzhm+sbS9km4dTk4eYVgBTWsYnl7baaSRtDGwDPNDmvBER0aAmA2IRsLOkHSVtSnXSeUGfaRYAR5fhtwM/su3SPrtc5bQjsDNwXYO1RkREH40dYirnFI4HLgfGAfNt3yrpVKDb9gLg34FvSVoCrKIKEcp03wUWA6uBD9l+qqlaiyEfphqFss0bvrG2vZBtHjaq/mCPiIhYU75JHRERtRIQERFRa8wFxFBu/zEatbG9H5e0WNJNkv5bUr/XRI8WA21zy3Rvk2RJo/6SyHa2WdI7y3t9q6RzR7rG4dbGv+2pkn4s6Vfl3/fhnahzuEiaL2mlpFv66ZekL5ffx02S9h7ySm2PmRfVyfLfAS8CNgV+DezWZ5q/Bs4sw7OBCzpdd8PbezCwRRn+4Gje3na3uUy3NXA11Rcyuzpd9wi8zzsDvwK2K+M7dLruEdjmecAHy/BuwJ2drnuI2/xaYG/gln76DwcuAwS8EvjlUNc51vYghnL7j9FowO21/WPbj5XRa6m+czKatfMeA3ya6t5ffx7J4hrSzjZ/ADjD9oMAtleOcI3DrZ1tNvDcMrwNcM8I1jfsbF9NdbVnf2YB57hyLbCtpIlDWedYC4i623/0vYXHGrf/AHpv/zEatbO9rY6l+gtkNBtwm8uu9xTb/zWShTWonff5JcBLJP1c0rWSZo5Ydc1oZ5tPAY6StBxYCHx4ZErrmHX9/z6gUX+rjRgeko4CuoADO11LkyRtBHwROKbDpYy0jakOMx1EtZd4taTdbT/UyaIaNgc42/YXJL2K6jtXL7f9dKcLGy3G2h7EUG7/MRq1dcsSSYcAfw8cYfuJEaqtKQNt89bAy4GrJN1Jdax2wSg/Ud3O+7wcWGD7L67ukPxbqsAYrdrZ5mOB7wLY/gWwGdWN/DZUw36LorEWEEO5/cdoNOD2StoLOIsqHEb7cWkYYJttP2x7vO3ptqdTnXc5wnZ3Z8odFu38u76Eau8BSeOpDjktHcEah1s723w38HoASbtSBUTPiFY5shYA7ytXM70SeNj2vUNZ4Jg6xOQh3P5jNGpzez8PbAVcWM7F3237iI4VPURtbvMGpc1tvhw4VNJi4Cngb22P1j3jdrf5E8DXJH2M6oT1MaP4jz0knUcV8uPLeZWTgU0AbJ9JdZ7lcKrn5zwGvH/I6xzFv6+IiGjQWDvEFBERbUpARERErQRERETUSkBEREStBERERNRKQET0IenIcpfXl3a6lohOSkBEPNsc4GflZyMkjWtq2RHDJQER0ULSVsABVLdpmF3axkn6f5JuKffZ/3Bp30fSNZJ+Lek6SVtLOkbS6S3Lu1TSQWX4j5K+IOnXwKsknSRpUVnuvN67BkvaSdKVZbk3SHqxpHMkHdmy3O9IqrtLbcSwSUBErGkW8APbvwUekPQKYC4wHdjT9h7Ad8rtHS4APmJ7BnAI8PgAy96S6h79M2z/DDjd9j62Xw5sDry5TPcdqltzzwBeDdxL9Q3/YwAkbVPaN5S70cZ6KgERsaY5VM8WoPycQ/Xhf1a5/Tu2VwG7APfaXlTaHuntX4ungItaxg9W9dTCm4HXAS+TtDUwyfbFZbl/tv2Y7Z9Q3XtoQqnpojbWFzEkY+peTBFrI2l7qg/q3SWZ6h4/proxXLtWs+YfXpu1DP/Z9lNlXZsBX6V6mt0ySaf0mbbOOcBRVIe+hnyfnYiBZA8i4hlvB75le1q52+sU4PdUj7P8q3L7994guR2YKGmf0rZ16b8T2FPSRpKmUD35rE5vGNxfznu8HcD2o8Dy3vMNqp6RvkWZ9mzgo2W6xcO21RH9SEBEPGMOcHGftouAiVS3jr6pnGB+d3nM5buAr5S2K6g+9H9OFSqLgS8DN9StqDyo52vALVR3JG3dS3kv8DeSbgKuAV5Q5vkDcBvwjaFuaEQ7cjfXiFGi7EncDOxt++FO1xMbvuxBRIwC5al/twFfSTjESMkeRERE1MoeRERE1EpARERErQRERETUSkBEREStBERERNT6/5WLAWlxQhHkAAAAAElFTkSuQmCC", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import os\n", + "import json\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def load_jsonl_files_recursively(dir_path):\n", + " all_data = []\n", + " \n", + " for root, _, files in os.walk(dir_path):\n", + " for file in files:\n", + " if file.endswith(\".jsonl\"):\n", + " file_path = os.path.join(root, file)\n", + " with open(file_path, \"r\") as f:\n", + " file_data = [json.loads(line) for line in f]\n", + " all_data.extend(file_data)\n", + " \n", + " return all_data\n", + "\n", + "def extract_accuracies(data):\n", + " accuracies = []\n", + " for record in data:\n", + " if 'final_report' in record:\n", + " accuracy = record['final_report']['accuracy']\n", + " accuracies.append(accuracy)\n", + " return accuracies\n", + "\n", + "# Load the data recursively\n", + "dir_path = \"evals\"\n", + "data = load_jsonl_files_recursively(dir_path)\n", + "\n", + "# Extract accuracies from the data\n", + "accuracies = extract_accuracies(data)\n", + "\n", + "# Plot the accuracies in a histogram chart\n", + "plt.hist(accuracies, bins=100, range=(0, 1), edgecolor='black')\n", + "plt.xlabel(\"Accuracy\")\n", + "plt.ylabel(\"Frequency\")\n", + "plt.title(\"Accuracy Histogram\")\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Counts for each eval_name:\n", + "test-match.s1.simple-v0: 22\n", + "None: 45\n", + "test-fuzzy-match.s1.simple-v0: 2\n" + ] + } + ], + "source": [ + "import os\n", + "import json\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def print_graph():\n", + " directory = 'evals/'\n", + " files = os.listdir(directory)\n", + " \n", + " eval_name_counter = {}\n", + " \n", + " for file in files:\n", + " if file.endswith(\".jsonl\"):\n", + " with open(os.path.join(directory, file), 'r') as f:\n", + " jsonl_content = f.read()\n", + " \n", + " # Read the JSONL content into a DataFrame\n", + " data = [json.loads(line) for line in jsonl_content.split('\\n') if line]\n", + " df = pd.DataFrame(data)\n", + "\n", + " if 'spec' not in df.columns:\n", + " continue\n", + "\n", + " # Extract the \"eval_name\" from the \"spec\" dictionaries\n", + " df['eval_name'] = df['spec'].apply(lambda x: x['eval_name'] if isinstance(x, dict) else None)\n", + "\n", + " for eval_name in df['eval_name']:\n", + " if eval_name not in eval_name_counter:\n", + " eval_name_counter[eval_name] = 0\n", + " eval_name_counter[eval_name] += 1\n", + "\n", + " # Print the counts\n", + " print(\"Counts for each eval_name:\")\n", + " for eval_name, count in eval_name_counter.items():\n", + " print(f\"{eval_name}: {count}\")\n", + "\n", + "print_graph()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Counts for each eval_name:\n", + "test-match.s1.simple-v0: 22\n", + "test-fuzzy-match.s1.simple-v0: 2\n" + ] + } + ], + "source": [ + "import os\n", + "import json\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def print_graph():\n", + " directory = 'evals/'\n", + " files = os.listdir(directory)\n", + " \n", + " eval_name_counter = {}\n", + " \n", + " for file in files:\n", + " if file.endswith(\".jsonl\"):\n", + " with open(os.path.join(directory, file), 'r') 
as f:\n", + " jsonl_content = f.read()\n", + " \n", + " # Read the JSONL content into a DataFrame\n", + " data = [json.loads(line) for line in jsonl_content.split('\\n') if line]\n", + " df = pd.DataFrame(data)\n", + "\n", + " if 'spec' not in df.columns:\n", + " continue\n", + "\n", + " # Filter the DataFrame to only include rows with the \"spec\" key\n", + " spec_df = df[df['spec'].notna()].copy()\n", + "\n", + " # Extract the \"eval_name\" from the \"spec\" dictionaries\n", + " spec_df.loc[:, 'eval_name'] = spec_df['spec'].apply(lambda x: x['eval_name'])\n", + "\n", + " for eval_name in spec_df['eval_name']:\n", + " if eval_name not in eval_name_counter:\n", + " eval_name_counter[eval_name] = 0\n", + " eval_name_counter[eval_name] += 1\n", + "\n", + " # Print the counts\n", + " print(\"Counts for each eval_name:\")\n", + " for eval_name, count in eval_name_counter.items():\n", + " print(f\"{eval_name}: {count}\")\n", + "\n", + "print_graph()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.0 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} -- cgit v1.2.3 From b8c7c05dd5f88c02878ad028869bca81f500dd5d Mon Sep 17 00:00:00 2001 From: Douglas Schonholtz <15002691+dschonholtz@users.noreply.github.com> Date: Sat, 22 Apr 2023 19:17:28 -0400 Subject: windows docs make workspace if not there (#25) * windows docs make workspace if not there * small fixes --- README.md | 43 +++++++++++++++++------------------ auto_gpt_benchmarking/AutoGPTAgent.py | 3 +++ 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index fe8bb6b04..c4a891861 100644 --- a/README.md +++ b/README.md @@ -13,54 +13,53 @@ documented. Clone the repo with: - `git clone git@github.com:Significant-Gravitas/Auto-GPT-Benchmarks.git` - `cd Auto-GPT-Benchmarks` + git clone git@github.com:Significant-Gravitas/Auto-GPT-Benchmarks.git + cd Auto-GPT-Benchmarks Create a venv with - `python3.9 -m venv venv` + python3.9 -m venv venv -Activate it with +On MaxOS/Linux Activate it with - `source venv/bin/activate` + source venv/bin/activate + +On Windows: + + venv/scripts/activate Install the requirements with: - `pip install -r requirements.txt` + pip install -r requirements.txt If you haven't already clone the AutoGPT repo somewhere else on your machine. DO NOT CLONE IT INTO A SUBDIR OF THIS REPO. - `cd somewhere/else` - `git clone git@github.com:Significant-Gravitas/Auto-GPT.git` + cd somewhere/else + git clone git@github.com:Significant-Gravitas/Auto-GPT.git + cd Auto-GPT + git checkout stable # Or the branch you want to benchmark -You will need to update the .env file in the Auto-GPT repo to have your OpenAI api key. The file in question is at: +You will need to update the .env file in the Auto-GPT repo to have your OpenAI api key. The file in question is at. 
This should be copied from the .env.template as described in the Auto-GPT README.md.
 
-    `Auto-GPT/.env`
+    Auto-GPT/.env
 
 Finally, we assume you have a docker container built from the Dockerfile in the Auto-GPT repo.
 
 Build this with:
 
-    `cd Auto-GPT`
-    `docker build -t autogpt .`
-
-If you want to run with redis as your memory system, you can stand up a redis image in the AutoGPT repo with
-
-    `docker compose up`
-
-Then you will need to adjust some variables in your .env file to use the redis memory backend.
-See the AutoGPT docs on how to do that.
+    cd Auto-GPT
+    docker build -t autogpt .
 
 Run your first eval with:
 
-    `cd Auto-GPT-Benchmarks`
-    `python3 auto_gpt_benchmarking test-match --auto-gpt-path /your/path/to/Auto-GPT`
+    cd Auto-GPT-Benchmarks
+    python3 auto_gpt_benchmarking test-match --auto-gpt-path /your/path/to/Auto-GPT
 
 You should only need to use the --auto-gpt-path flag the first time you run it. Afterwards, that will be saved in
 
-    `auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml`.
+    auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml.
 
 To see a full list of available flags you can run `python3 -m auto_gpt_benchmarking --help`
 Some of these are inherited from the OpenAI evals framework and do not work quite as intended as they are not applicable
diff --git a/auto_gpt_benchmarking/AutoGPTAgent.py b/auto_gpt_benchmarking/AutoGPTAgent.py
index 63cebf1cb..26d0f4e5b 100644
--- a/auto_gpt_benchmarking/AutoGPTAgent.py
+++ b/auto_gpt_benchmarking/AutoGPTAgent.py
@@ -108,6 +108,9 @@ class AutoGPTAgent:
     def __init__(self, prompt, auto_gpt_path: str):
         self.auto_gpt_path = Path(auto_gpt_path)
         self.auto_workspace = self.auto_gpt_path / "auto_gpt_workspace"
+        # if the workspace doesn't exist, create it
+        if not self.auto_workspace.exists():
+            self.auto_workspace.mkdir()
         self.prompt_file = self.auto_workspace / "prompt.txt"
         self.output_file = self.auto_workspace / "output.txt"
         self.file_logger = self.auto_workspace / "file_logger.txt"
-- 
cgit v1.2.3


From 04722e7fc5a5e24ef70b15e22be4dcff764c5367 Mon Sep 17 00:00:00 2001
From: Douglas Schonholtz <15002691+dschonholtz@users.noreply.github.com>
Date: Wed, 3 May 2023 10:14:44 -0400
Subject: EvalNames with dates for the eval run filename and compatibility with
 0.3.0 (#26)

* EvalNames with dates and the eval run

* Ignore .idea files, update readme to use 3.10, updates for 0.3.0
---
 .gitignore                            |  2 ++
 README.md                             |  2 +-
 auto_gpt_benchmarking/AutoGPTAgent.py | 13 ++++++++-----
 auto_gpt_benchmarking/__main__.py     |  6 ++++--
 4 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/.gitignore b/.gitignore
index e68877ae9..04a0b6b0e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -129,3 +129,5 @@ dmypy.json
 .pyre/
 
 /data
+
+/.idea
diff --git a/README.md b/README.md
index c4a891861..8e0a63c40 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@ Clone the repo with:
 
 Create a venv with
 
-    python3.9 -m venv venv
+    python3.10 -m venv venv
 
 On MaxOS/Linux Activate it with
 
diff --git a/auto_gpt_benchmarking/AutoGPTAgent.py b/auto_gpt_benchmarking/AutoGPTAgent.py
index 26d0f4e5b..7a60009d9 100644
--- a/auto_gpt_benchmarking/AutoGPTAgent.py
+++ b/auto_gpt_benchmarking/AutoGPTAgent.py
@@ -80,15 +80,17 @@ class AutoGPTAgent:
         envs = [
             f"{line.strip()}" for line in open(
                 env_file
-            ) if line.strip() != "" and line.strip()[0] != "#" and line.strip()[0] != "\n"]
+            ) if line.strip() != "" and line.strip()[0] != "#" and line.strip()[0] != "\n" and "=" in line and not line.startswith('SMART_LLM_MODEL')]
+
+        
envs.append("SMART_LLM_MODEL=gpt-3.5-turbo") self.container = client.containers.run( image="autogpt", - command="--continuous -C '/home/appuser/auto_gpt_workspace/ai_settings.yaml'", + command="--continuous -C '/app/auto_gpt_workspace/ai_settings.yaml' --skip-news", environment=envs, volumes={ - self.auto_workspace: {"bind": "/home/appuser/auto_gpt_workspace", "mode": "rw"}, - f"{self.auto_gpt_path}/autogpt": {"bind": "/home/appuser/autogpt", "mode": "rw"}, + self.auto_workspace: {"bind": "/app/auto_gpt_workspace", "mode": "rw"}, + f"{self.auto_gpt_path}/autogpt": {"bind": "/app/autogpt", "mode": "rw"}, }, stdin_open=True, tty=True, @@ -103,11 +105,12 @@ class AutoGPTAgent: """ while True: if self.output_file.exists(): + print("Output file exists") return self.output_file.read_text() def __init__(self, prompt, auto_gpt_path: str): self.auto_gpt_path = Path(auto_gpt_path) - self.auto_workspace = self.auto_gpt_path / "auto_gpt_workspace" + self.auto_workspace = self.auto_gpt_path / "autogpt" / "auto_gpt_workspace" # if the workspace doesn't exist, create it if not self.auto_workspace.exists(): self.auto_workspace.mkdir() diff --git a/auto_gpt_benchmarking/__main__.py b/auto_gpt_benchmarking/__main__.py index c42c73b8e..84761a65d 100644 --- a/auto_gpt_benchmarking/__main__.py +++ b/auto_gpt_benchmarking/__main__.py @@ -56,8 +56,7 @@ def parse_args() -> argparse.Namespace: "--visible", action=argparse.BooleanOptionalAction, default=None) parser.add_argument("--seed", type=int, default=20220722) parser.add_argument("--user", type=str, default="") - parser.add_argument("--record_path", type=str, default=str(Path( - __file__).parent.parent / "data" / f"eval-{datetime.now().strftime('%Y%m%d-%H%M%S')}.jsonl")) + parser.add_argument("--record_path", type=str, default=None) parser.add_argument( "--log_to_file", type=str, default=None, # default=str( # Path(__file__).parent.parent / "data" / "log" / "log.txt" @@ -133,6 +132,9 @@ if __name__ == "__main__": # Obviously, a top level import would be better. This allows us to set the API key with the env file, as it gets # set in the evaluator. We can't set it before the import because the import will fail without an API key. from auto_gpt_benchmarking.Evaluator import Evaluator, OAIRunArgs + if args.record_path is None: + args.record_path = str(Path( + __file__).parent.parent / "data" / f"eval-{args.eval}-{datetime.now().strftime('%Y%m%d-%H%M%S')}.jsonl") run_args = OAIRunArgs( completion_fn=args.completion_fn, eval=args.eval, -- cgit v1.2.3 From dfb73204bf8c278cc4f50155a7a71b14b55d8b3a Mon Sep 17 00:00:00 2001 From: Douglas Schonholtz <15002691+dschonholtz@users.noreply.github.com> Date: Fri, 5 May 2023 16:33:39 -0400 Subject: Update readme to suggest people check out challenges --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 8e0a63c40..e84ff1af8 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,6 @@ +# Closing in favor of Challenges! +Please check out challenges run in our CI pipeline: https://github.com/Significant-Gravitas/Auto-GPT/tree/master/tests/integration/challenges + # Auto-GPT-Benchmarks A set of standardised benchmarks to assess the performance of Auto-GPT. This currently uses the OpenAI Evals framework to run the benchmarks. 
-- cgit v1.2.3 From c6a22abb10c6c2d3d25814a24b269ad250945243 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Sun, 18 Jun 2023 07:30:54 -0700 Subject: Initial commit --- LICENSE | 201 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ README.md | 2 + 2 files changed, 203 insertions(+) create mode 100644 LICENSE create mode 100644 README.md diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. 
We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md new file mode 100644 index 000000000..b42ff76ea --- /dev/null +++ b/README.md @@ -0,0 +1,2 @@ +# Auto-GPT-Factory +Build your own Auto-GPT -- cgit v1.2.3 From 7d4d51ccbbf9912c74353c8088d5c12ffec99823 Mon Sep 17 00:00:00 2001 From: Merwane Hamadi Date: Sun, 18 Jun 2023 07:55:16 -0700 Subject: Setup --- .gitignore | 162 ++++++++++++++++++++++++++++++++++++++ LICENSE | 222 +++++----------------------------------------------- autogpt/__init__.py | 0 tests/__init__.py | 0 4 files changed, 183 insertions(+), 201 deletions(-) create mode 100644 .gitignore create mode 100644 autogpt/__init__.py create mode 100644 tests/__init__.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..29a0285a8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,162 @@ +## Original ignores +autogpt/keys.py +autogpt/*.json +**/auto_gpt_workspace/* +*.mpeg +.env +azure.yaml +ai_settings.yaml +last_run_ai_settings.yaml +.vscode +.idea/* +auto-gpt.json +log.txt +log-ingestion.txt +logs +*.log +*.mp3 +mem.sqlite3 +venvAutoGPT + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +plugins/ +plugins_config.yaml +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ +site/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.direnv/ +.env +.venv +env/ +venv*/ +ENV/ +env.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ +llama-* +vicuna-* + +# mac +.DS_Store + +openai/ + +# news +CURRENT_BULLETIN.md diff --git a/LICENSE b/LICENSE index 261eeb9e9..601935b85 100644 --- a/LICENSE +++ b/LICENSE @@ -1,201 +1,21 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. 
For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. 
The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. +MIT License + +Copyright (c) 2023 Toran Bruce Richards + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/autogpt/__init__.py b/autogpt/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 000000000..e69de29bb -- cgit v1.2.3 From 51f2295971888026275bde4127945df8b182d731 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 18 Jun 2023 11:14:54 -0400 Subject: init agbenchmark --- .gitignore | 37 +++- .gitmodules | 0 .vscode/settings.json | 6 + LICENSE | 2 +- README.md | 191 +++++++++--------- agbenchmark/__init__.py | 0 agbenchmark/benchmark/__init__.py | 0 agbenchmark/benchmark/benchmark.py | 1 + agbenchmark/benchmark/challenges/Challenge.py | 0 agbenchmark/benchmark/challenges/__init__.py | 0 .../benchmark/challenges/adaptability/a1_test.py | 0 .../challenges/basic_abilities/browse_test.py | 0 .../challenges/basic_abilities/read_file_test.py | 0 .../basic_abilities/remember_context_test.py | 0 .../challenges/basic_abilities/write_file_test.py | 0 agbenchmark/benchmark/challenges/code/c1_test.py | 0 agbenchmark/benchmark/challenges/memory/m1_test.py | 0 .../benchmark/challenges/retrieval/r1_test.py | 0 agbenchmark/benchmark/challenges/utils.py | 0 .../challenges/web_navigation/wn1_test.py | 0 .../benchmark/challenges/writing/w1_test.py | 0 agbenchmark/benchmark/run.py | 1 + agbenchmark/server/__init__.py | 0 agbenchmark/server/api.py | 0 agbenchmark/server/utils.py | 0 agbenchmark/workspace/__init__.py | 0 agbenchmark/workspace/cloud_services/aws.py | 0 agbenchmark/workspace/workspace_manager.py | 1 + auto_gpt_benchmarking/AutoGPTAgent.py | 155 --------------- auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml | 6 - auto_gpt_benchmarking/CompletionFn.py | 34 ---- auto_gpt_benchmarking/Evaluator.py | 61 ------ auto_gpt_benchmarking/__init__.py | 0 auto_gpt_benchmarking/__main__.py | 158 --------------- .../completion_fns/auto_gpt_completion_fn.yaml | 4 - evals_analytics.ipynb | 220 --------------------- poetry.lock | 101 ++++++++++ pyproject.toml | 23 +++ requirements.txt | 81 -------- tests/__init__.py | 0 tests/test_api.py | 0 tests/test_benchmark.py | 0 tests/test_workspace_manager.py | 0 43 files changed, 267 insertions(+), 815 deletions(-) delete mode 100644 .gitmodules create mode 100644 .vscode/settings.json create mode 100644 agbenchmark/__init__.py create mode 100644 agbenchmark/benchmark/__init__.py create mode 100644 agbenchmark/benchmark/benchmark.py create mode 100644 agbenchmark/benchmark/challenges/Challenge.py create mode 100644 agbenchmark/benchmark/challenges/__init__.py create mode 100644 agbenchmark/benchmark/challenges/adaptability/a1_test.py create mode 100644 agbenchmark/benchmark/challenges/basic_abilities/browse_test.py create mode 100644 agbenchmark/benchmark/challenges/basic_abilities/read_file_test.py create mode 100644 agbenchmark/benchmark/challenges/basic_abilities/remember_context_test.py create mode 100644 agbenchmark/benchmark/challenges/basic_abilities/write_file_test.py create mode 100644 agbenchmark/benchmark/challenges/code/c1_test.py create mode 100644 agbenchmark/benchmark/challenges/memory/m1_test.py create mode 100644 agbenchmark/benchmark/challenges/retrieval/r1_test.py create mode 100644 agbenchmark/benchmark/challenges/utils.py create mode 100644 agbenchmark/benchmark/challenges/web_navigation/wn1_test.py create mode 100644 agbenchmark/benchmark/challenges/writing/w1_test.py create mode 100644 agbenchmark/benchmark/run.py create mode 100644 agbenchmark/server/__init__.py create mode 100644 agbenchmark/server/api.py create mode 100644 
agbenchmark/server/utils.py create mode 100644 agbenchmark/workspace/__init__.py create mode 100644 agbenchmark/workspace/cloud_services/aws.py create mode 100644 agbenchmark/workspace/workspace_manager.py delete mode 100644 auto_gpt_benchmarking/AutoGPTAgent.py delete mode 100644 auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml delete mode 100644 auto_gpt_benchmarking/CompletionFn.py delete mode 100644 auto_gpt_benchmarking/Evaluator.py delete mode 100644 auto_gpt_benchmarking/__init__.py delete mode 100644 auto_gpt_benchmarking/__main__.py delete mode 100644 auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml delete mode 100644 evals_analytics.ipynb create mode 100644 poetry.lock create mode 100644 pyproject.toml delete mode 100644 requirements.txt create mode 100644 tests/__init__.py create mode 100644 tests/test_api.py create mode 100644 tests/test_benchmark.py create mode 100644 tests/test_workspace_manager.py diff --git a/.gitignore b/.gitignore index 04a0b6b0e..68bc17f9f 100644 --- a/.gitignore +++ b/.gitignore @@ -20,7 +20,6 @@ parts/ sdist/ var/ wheels/ -pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg @@ -50,6 +49,7 @@ coverage.xml *.py,cover .hypothesis/ .pytest_cache/ +cover/ # Translations *.mo @@ -72,6 +72,7 @@ instance/ docs/_build/ # PyBuilder +.pybuilder/ target/ # Jupyter Notebook @@ -82,7 +83,9 @@ profile_default/ ipython_config.py # pyenv -.python-version +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. @@ -91,7 +94,22 @@ ipython_config.py # install all needed dependencies. #Pipfile.lock -# PEP 582; used by e.g. github.com/David-OConnor/pyflow +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff @@ -128,6 +146,15 @@ dmypy.json # Pyre type checker .pyre/ -/data +# pytype static type analyzer +.pytype/ -/.idea +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index e69de29bb..000000000 diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 000000000..3445835be --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,6 @@ +{ + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter" + }, + "python.formatting.provider": "none" +} diff --git a/LICENSE b/LICENSE index 601935b85..696ff02ba 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2023 Toran Bruce Richards +Copyright (c) 2023 Silen Naihin Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index e84ff1af8..820c0f51e 100644 --- a/README.md +++ b/README.md @@ -1,120 +1,131 @@ -# Closing in favor of Challenges! -Please check out challenges run in our CI pipeline: https://github.com/Significant-Gravitas/Auto-GPT/tree/master/tests/integration/challenges +# agbenchmark -# Auto-GPT-Benchmarks -A set of standardised benchmarks to assess the performance of Auto-GPT. -This currently uses the OpenAI Evals framework to run the benchmarks. +A repo built for the purpose of benchmarking the performance of agents far and wide, regardless of how they are set up and how they work -## Setup +#### MVP: function calls api, api returns presigned url, folder is uploaded, write file challenge is measured, score is given -You must add the auto_gpt_benchmarking dir to the python path -Do this with a path file in your venv. OpenAI evals needs to import it. +#### Diagrams: https://whimsical.com/agbenchmark-5n4hXBq1ZGzBwRsK4TVY7x -These instructions currently assume ubuntu 22.04. -They should be fairly adaptable to the Windows/macOS equivalents. Please submit a PR if you would like to see your OS -documented. +## Contributing -Clone the repo with: +- Make sure you have `poetry` installed - `pip install poetry`. +- Then run `poetry install` for dependencies - git clone git@github.com:Significant-Gravitas/Auto-GPT-Benchmarks.git - cd Auto-GPT-Benchmarks +- To add a requirement: `poetry add requirement`. +- To run in the venv: `poetry run python script.py` -Create a venv with +Feel free to merge with `main` at will (but also feel free to ask for review) - if you can't, send a message in the R&D chat for access. - python3.10 -m venv venv +If you push at any point and break things - it'll happen to everyone - fix it asap. Step 1 is to revert `main` to the last working commit. +Let people know what your beautiful code does, and document everything well. -On macOS/Linux, activate it with +Share your progress :) - source venv/bin/activate +## Api -On Windows: +FastAPI with REST, import requests + +``` +POST hostname:8080/challenges +{ + "test_name": "" + "challenge": "memory" - optional +} +``` - venv/scripts/activate +## Auth -Install the requirements with: +Get a preSignedUrl from the API - pip install -r requirements.txt +``` +POST preSignedUrl +{ + "artifacts": [{}] +} +``` -If you haven't already, clone the AutoGPT repo somewhere else on your machine. -DO NOT CLONE IT INTO A SUBDIR OF THIS REPO. +## Workspace - cd somewhere/else - git clone git@github.com:Significant-Gravitas/Auto-GPT.git - cd Auto-GPT - git checkout stable # Or the branch you want to benchmark +Kubernetes with AWS S3 or GCP -You will need to update the .env file in the Auto-GPT repo to have your OpenAI api key. The file in question is at.
This should be copied from the .env.template as described in the Auto-GPT README.md +## Challenges - Auto-GPT/.env +#### Dataset -Finally, we assume you have a docker container built from the Dockerfile in the Auto-GPT repo. +Manually created, existing challenges within Auto-GPT, https://osu-nlp-group.github.io/Mind2Web/ -Build this with: +#### Simple challenge creation through a DSL (domain-specific language) - cd Auto-GPT - docker build -t autogpt . +``` +Challenge TicTacToeCoding + Description "The agent should implement a basic tic-tac-toe game in Python." + Artifacts { + Code "tictactoe.py" + } + Tasks { + Code "Write a function to initialize the game board." + Code "Write a function to handle a player's turn." + Code "Write a function to check for a winning move." + Test "Write tests for board initialization, turn handling, and win detection." + Command "Run the test suite to ensure everything is working as expected." + } + SuccessCriteria { + Correctness "The game should correctly alternate between two players." + Correctness "The game should correctly identify a winning move." + Efficiency "The game should not use unnecessary computational resources." + Design "The solution should follow good practices for clean, idiomatic Python." + } +EndChallenge +``` -Run your first eval with: +#### Validators - cd Auto-GPT-Benchmarks - python3 auto_gpt_benchmarking test-match --auto-gpt-path /your/path/to/Auto-GPT +Designed to handle specific types of output (e.g., text, code, structured data) -You should only need to use the --auto-gpt-path flag the first time you run it. Afterwards, that will be saved in +#### Logging - auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml. +Log different requests coming in - write file, change file, etc. Maybe a db in the future for metrics, logs, etc. -To see a full list of available flags you can run `python3 -m auto_gpt_benchmarking --help` -Some of these are inherited from the OpenAI evals framework and do not work quite as intended as they are not applicable -to this use case. +#### Written Challenges -This saves a file in `Auto-GPT-Benchmarks/data/records.jsonl` -This file is currently a default that is configurable with the --record_path flag. You will have to specify the fully -qualified path. +For code and writing challenges, we can create a reference text and use metrics like METEOR, BERTScore, and BARTScore. -## Currently Supported Benchmarks: -From OpenAI Evals -- [x] test-match -- [x] test-fuzzy-match -- [ ] Everything else they have... +## Repo -## Understanding OpenAI Evals - -The Evals docs are here and very good: https://github.com/openai/evals/tree/main/docs - -The basic idea is this though: -1. Use a completion function to point to the language model or in our case AutoGPT, the model you want to test. -2. Register that completion function with the evals framework with a yaml in a `completion_fns` dir. -3. Run the evals against the completion function. - -Then you can also make more yaml-defined evals and run them against the completion function as needed. - -### Completions Functions - -See our yaml file in `completion_fns` dir for the registration of the completion function.
-See our completion function itself in CompletionFn.py -That points to the AutoGPT model we want to test which is spun up dynamically in a docker container in AutoGPTAgent.py - - -# Example final output: - -/Auto-GPT-Benchmarks-fork$ cat /tmp/evallogs/230417220821DPM75QNS_auto_gpt_completion_fn_test-match.jsonl -{"spec": {"completion_fns": ["auto_gpt_completion_fn"], "eval_name": "test-match.s1.simple-v0", "base_eval": "test-match", "split": "s1", "run_config": {"completion_fns": ["auto_gpt_completion_fn"], "eval_spec": {"cls": "evals.elsuite.basic.match:Match", "args": {"samples_jsonl": "test_match/samples.jsonl"}, "key": "test-match.s1.simple-v0", "group": "test-basic"}, "seed": 20220722, "max_samples": null, "command": "/home/douglas/AGI/Auto-GPT-Benchmarks-fork/venv/bin/oaieval auto_gpt_completion_fn test-match --registry_path /home/douglas/AGI/Auto-GPT-Benchmarks-fork/auto_gpt_benchmarking", "initial_settings": {"visible": true}}, "created_by": "", "run_id": "230417220821DPM75QNS", "created_at": "2023-04-17 22:08:21.904498"}} -{"final_report": {"accuracy": 0.3333333333333333}} -{"run_id": "230417220821DPM75QNS", "event_id": 0, "sample_id": "test-match.s1.2", "type": "sampling", "data": {"prompt": "Complete the phrase as concisely as possible.\nUser: OpenAI was founded in 20\nAssistant: ", "sampled": "OpenAI was founded in 2015.2015"}, "created_by": "", "created_at": "2023-04-17 22:10:13.127375+00:00"} -{"run_id": "230417220821DPM75QNS", "event_id": 1, "sample_id": "test-match.s1.2", "type": "match", "data": {"correct": false, "expected": "15", "picked": null, "sampled": "OpenAI was founded in 2015.2015", "options": ["15"]}, "created_by": "", "created_at": "2023-04-17 22:10:13.127550+00:00"} -{"run_id": "230417220821DPM75QNS", "event_id": 2, "sample_id": "test-match.s1.1", "type": "sampling", "data": {"prompt": "Complete the phrase as concisely as possible.\nUser: The first US president was \nAssistant: ", "sampled": "George Washington"}, "created_by": "", "created_at": "2023-04-17 22:11:17.761693+00:00"} -{"run_id": "230417220821DPM75QNS", "event_id": 3, "sample_id": "test-match.s1.1", "type": "match", "data": {"correct": true, "expected": "George Washington", "picked": "George Washington", "sampled": "George Washington", "options": ["George Washington"]}, "created_by": "", "created_at": "2023-04-17 22:11:17.761739+00:00"} -{"run_id": "230417220821DPM75QNS", "event_id": 4, "sample_id": "test-match.s1.0", "type": "sampling", "data": {"prompt": "Complete the phrase as concisely as possible.\nUser: Once upon a \nAssistant: ", "sampled": "Once upon a time"}, "created_by": "", "created_at": "2023-04-17 22:12:04.691026+00:00"} -{"run_id": "230417220821DPM75QNS", "event_id": 5, "sample_id": "test-match.s1.0", "type": "match", "data": {"correct": false, "expected": "time", "picked": null, "sampled": "Once upon a time", "options": ["time"]}, "created_by": "", "created_at": "2023-04-17 22:12:04.691064+00:00"} -(venv) douglas@douglas-XPS-15-9500:~/AGI/Auto-GPT-Benchmarks-fork$ - -# What is next? - -- [ ] Run the rest of the OpenAI Evals Especially the modelgraded ones -- [ ] Build longer form tasks, (code fix backed by testing) -- [ ] Explicitly note the common failure modes in the test harness and fix them. 
Most of these appear to be failure modes with the core AutoGPT project -- [ ] Get token counting data from the model. Add scores to result files based on pricing associated with the tokens and models used -- [ ] Think about how this can be applied to other projects besides AutoGPT so we can be THE agent evaluation framework. -- [ ] Figure out how the OpenAI Evals results are saved... -- [ ] Support multi-threaded evals. OpenAI has great support for this. The docker system built here doesn't. +``` +|-- agbenchmark/ **main project directory** +| |-- __init__.py +| |-- server/ +| | |-- __init__.py +| | |-- api.py **opens server on host and exposes urls** +| | |-- utils.py +| |-- benchmark/ +| | |-- __init__.py +| | |-- benchmark.py **combining scores, metrics, final evaluation** +| | |-- run.py **entry point. sets everything up** +| | |-- challenges/ **challenges across different metrics** +| | | |-- __init__.py +| | | |-- Challenge.py **easy challenge creation through Challenge class. potentially how DSL is defined. may need to inherit challenge class like Adaptability(Challenge)** +| | | |-- utils.py +| | | |-- adaptability.py +| | | |-- basic_abilities.py +| | | |-- code.py +| | | |-- memory.py +| | | |-- retrieval.py +| | | |-- web_navigation.py +| | | |-- writing.py +| |-- workspace/ **workspace related func** +| | |-- __init__.py +| | |-- workspace_manager.py **creation, deletion, preSignedUrl generation** +| | |-- cloud_services/ +| | | |-- __init__.py +| | | |-- aws.py **not finalized, but write, read, and del files** +|-- tests/ **test func of agbenchmark** +| |-- __init__.py +| |-- test_api.py +| |-- test_benchmark.py +| |-- test_workspace_manager.py +``` + +Later: GitHub Actions integration, OpenAPI?, good versioning and backward compatibility diff --git a/agbenchmark/__init__.py b/agbenchmark/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/benchmark/__init__.py b/agbenchmark/benchmark/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/benchmark/benchmark.py b/agbenchmark/benchmark/benchmark.py new file mode 100644 index 000000000..2f8124272 --- /dev/null +++ b/agbenchmark/benchmark/benchmark.py @@ -0,0 +1 @@ +# how well the agent did on the challenges, the metrics calculation diff --git a/agbenchmark/benchmark/challenges/Challenge.py b/agbenchmark/benchmark/challenges/Challenge.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/benchmark/challenges/__init__.py b/agbenchmark/benchmark/challenges/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/benchmark/challenges/adaptability/a1_test.py b/agbenchmark/benchmark/challenges/adaptability/a1_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/benchmark/challenges/basic_abilities/browse_test.py b/agbenchmark/benchmark/challenges/basic_abilities/browse_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/benchmark/challenges/basic_abilities/read_file_test.py b/agbenchmark/benchmark/challenges/basic_abilities/read_file_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/benchmark/challenges/basic_abilities/remember_context_test.py b/agbenchmark/benchmark/challenges/basic_abilities/remember_context_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/benchmark/challenges/basic_abilities/write_file_test.py b/agbenchmark/benchmark/challenges/basic_abilities/write_file_test.py new file mode 100644 index
000000000..e69de29bb diff --git a/agbenchmark/benchmark/challenges/code/c1_test.py b/agbenchmark/benchmark/challenges/code/c1_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/benchmark/challenges/memory/m1_test.py b/agbenchmark/benchmark/challenges/memory/m1_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/benchmark/challenges/retrieval/r1_test.py b/agbenchmark/benchmark/challenges/retrieval/r1_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/benchmark/challenges/utils.py b/agbenchmark/benchmark/challenges/utils.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/benchmark/challenges/web_navigation/wn1_test.py b/agbenchmark/benchmark/challenges/web_navigation/wn1_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/benchmark/challenges/writing/w1_test.py b/agbenchmark/benchmark/challenges/writing/w1_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/benchmark/run.py b/agbenchmark/benchmark/run.py new file mode 100644 index 000000000..b07ac6b55 --- /dev/null +++ b/agbenchmark/benchmark/run.py @@ -0,0 +1 @@ +# running all of the different challenges diff --git a/agbenchmark/server/__init__.py b/agbenchmark/server/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/server/api.py b/agbenchmark/server/api.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/server/utils.py b/agbenchmark/server/utils.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/workspace/__init__.py b/agbenchmark/workspace/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/workspace/cloud_services/aws.py b/agbenchmark/workspace/cloud_services/aws.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/workspace/workspace_manager.py b/agbenchmark/workspace/workspace_manager.py new file mode 100644 index 000000000..cfcf3f7ac --- /dev/null +++ b/agbenchmark/workspace/workspace_manager.py @@ -0,0 +1 @@ +# Manages the workspaces including creation, deletion, etc diff --git a/auto_gpt_benchmarking/AutoGPTAgent.py b/auto_gpt_benchmarking/AutoGPTAgent.py deleted file mode 100644 index 7a60009d9..000000000 --- a/auto_gpt_benchmarking/AutoGPTAgent.py +++ /dev/null @@ -1,155 +0,0 @@ -""" -This instantiates an AutoGPT agent that is capable of handling any task. -It is designed to pass benchmarks as effectively as possible. - -Loads in the ai_settings.yaml file to get the AI's name, role, and goals. -Sets the AI to continuous mode, but kills it if it takes more than 50,000 tokens on any particular evaluation. - -The model is instantiated with a prompt from the AutoGPT completion function. - -Eventually we will also save and log all of the associated output and thinking for the model as well -""" -from pathlib import Path -import docker -import asyncio -import aiodocker - - -class AutoGPTAgent: - """ - A class object that contains the configuration information for the AI - The init function takes an evaluation prompt. - It copies the ai_settings.yaml file in AutoGPTData to the Auto-GPT repo. - It then copies the given prompt into a text file called prompt.txt in Auto-GPT/auto_gpt_workspace - It then polls the model's token usage and watches for a file called output.txt in the Auto-GPT/auto_gpt_workspace folder. - If the model has used more than 50,000 tokens, it kills the model.
- If the model has used less than 50,000 tokens, it returns the output.txt file. - """ - def _clean_up_workspace(self): - """ - Cleans up the workspace by deleting the prompt.txt and output.txt files. - :return: - """ - # check if the files are there and delete them if they are - if self.prompt_file.exists(): - self.prompt_file.unlink() - if self.output_file.exists(): - self.output_file.unlink() - if self.file_logger.exists(): - self.file_logger.unlink() - - def _copy_ai_settings(self) -> None: - self.ai_settings_dest.write_text(self.ai_settings_file.read_text()) - - def _copy_prompt(self) -> None: - self.prompt_file.write_text(self.prompt) - - async def _stream_logs(self, container: aiodocker.containers.DockerContainer) -> None: - try: - async for line in container.log(stdout=True, stderr=True, follow=True, tail="all"): - print(line.strip()) - await asyncio.sleep(1) - except aiodocker.exceptions.DockerError as e: - # Handle Docker errors (e.g., container is killed or removed) - print('Docker error: {}'.format(e)) - - async def _run_stream_logs(self) -> None: - """ - This grabs the docker containers id and streams the logs to the console with aiodocker. - :return: None - """ - async with aiodocker.Docker() as docker_client: - try: - container = docker_client.containers.container(self.container.id) - await self._stream_logs(container) - except aiodocker.exceptions.DockerError as e: - # Handle cases when the container is not found - print('Container not found: {}'.format(e)) - - def _start_agent(self): - """ - This starts the agent in the docker container. - This assumes you have the docker image built with: - docker build -t autogpt . - In the dockerfile in the Auto-GPT repo. - You also must set up the .env file in the Auto-GPT repo. - :return: - """ - client = docker.from_env() - env_file = self.auto_gpt_path / ".env" - envs = [ - f"{line.strip()}" for line in open( - env_file - ) if line.strip() != "" and line.strip()[0] != "#" and line.strip()[0] != "\n" and "=" in line and not line.startswith('SMART_LLM_MODEL')] - - envs.append("SMART_LLM_MODEL=gpt-3.5-turbo") - - self.container = client.containers.run( - image="autogpt", - command="--continuous -C '/app/auto_gpt_workspace/ai_settings.yaml' --skip-news", - environment=envs, - volumes={ - self.auto_workspace: {"bind": "/app/auto_gpt_workspace", "mode": "rw"}, - f"{self.auto_gpt_path}/autogpt": {"bind": "/app/autogpt", "mode": "rw"}, - }, - stdin_open=True, - tty=True, - detach=True - ) - asyncio.run(self._run_stream_logs()) - - def _poll_for_output(self): - """ - This polls the output file to see if the model has finished. 
- :return: - """ - while True: - if self.output_file.exists(): - print("Output file exists") - return self.output_file.read_text() - - def __init__(self, prompt, auto_gpt_path: str): - self.auto_gpt_path = Path(auto_gpt_path) - self.auto_workspace = self.auto_gpt_path / "autogpt" / "auto_gpt_workspace" - # if the workspace doesn't exist, create it - if not self.auto_workspace.exists(): - self.auto_workspace.mkdir() - self.prompt_file = self.auto_workspace / "prompt.txt" - self.output_file = self.auto_workspace / "output.txt" - self.file_logger = self.auto_workspace / "file_logger.txt" - self.ai_settings_file = Path(__file__).parent / "AutoGPTData" / "ai_settings.yaml" - self.ai_settings_dest = self.auto_workspace / "ai_settings.yaml" - self.prompt = prompt - self._clean_up_workspace() - self._copy_ai_settings() - self._copy_prompt() - self.container = None - self.killing = False - self.logging_task = None - - def start(self): - self._start_agent() - answer = self._poll_for_output() - print(f"Prompt was: {self.prompt}, Answer was: {answer}") - self.kill() - return answer - - def kill(self): - if self.killing: - return - self.killing = True - self._clean_up_workspace() - if self.container: - # kill the container - try: - self.container.kill() - self.container.remove() - except docker.errors.APIError: - print('Couldn\'t find container to kill. Assuming container successfully killed itself.') - if self.logging_task: - self.logging_task.cancel() - self.killing = False - - - - diff --git a/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml b/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml deleted file mode 100644 index ab6caaed0..000000000 --- a/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml +++ /dev/null @@ -1,6 +0,0 @@ -ai_goals: -- Evaluate the prompt in `prompt.txt` and find the best answer in the format provided. -- Get the correct answer to the question in the fewest number of steps possible. You are scored first on if you get the correct answer, and second on how many tokens you take to get the right answer so keep your thinking and tool usage as minimal as possible while still ensuring you get the correct answer. -- Save the final answer and output to the `output.txt` file, the only file you should write to, then immediately exit the program because you are done. 
-ai_name: EvaluationAgent -ai_role: an ai that is tested on how effectively it can efficiently evaluate questions and answer them correctly while using as few resources as possible diff --git a/auto_gpt_benchmarking/CompletionFn.py b/auto_gpt_benchmarking/CompletionFn.py deleted file mode 100644 index f82ede85c..000000000 --- a/auto_gpt_benchmarking/CompletionFn.py +++ /dev/null @@ -1,34 +0,0 @@ -from evals.api import CompletionFn, CompletionResult - -from evals.prompt.base import CompletionPrompt -from evals.record import record_sampling -from auto_gpt_benchmarking.AutoGPTAgent import AutoGPTAgent - - -class AutoGPTCompletionResult(CompletionResult): - def __init__(self, response) -> None: - self.response = response - - def get_completions(self) -> list[str]: - return [self.response.strip()] - - -class AutoGPTCompletionFn(CompletionFn): - - def __init__(self, auto_gpt_path, **kwargs) -> None: - self.auto_gpt_path = auto_gpt_path - self.agent = None - - def __call__(self, prompt, **kwargs) -> AutoGPTCompletionResult: - prompt = CompletionPrompt(prompt).to_formatted_prompt() - self.kill_agent() - self.agent = AutoGPTAgent(prompt, self.auto_gpt_path) - response = self.agent.start() - record_sampling(prompt=prompt, sampled=response) - return AutoGPTCompletionResult(response) - - def kill_agent(self): - if self.agent: - self.agent.kill() - - diff --git a/auto_gpt_benchmarking/Evaluator.py b/auto_gpt_benchmarking/Evaluator.py deleted file mode 100644 index 4301fb3bc..000000000 --- a/auto_gpt_benchmarking/Evaluator.py +++ /dev/null @@ -1,61 +0,0 @@ -""" -The evaluator class actually executes the evals. -""" -from evals.cli import oaieval -from evals.registry import Registry -from pathlib import Path -from typing import List, Optional, Tuple -import sys - - -class OAIRunArgs: - def __init__( - self, - completion_fn: str, - eval: str, - extra_eval_params: str = "", - max_samples: int = None, - cache: bool = True, - visible: bool = None, - seed: int = 20220722, - user: str = "", - record_path: str = None, - log_to_file: str = None, - debug: bool = False, - local_run: bool = True, - dry_run: bool = False, - dry_run_logging: bool = True, - ): - self.completion_fn = completion_fn - self.eval = eval - self.extra_eval_params = extra_eval_params - self.max_samples = max_samples - self.cache = cache - self.visible = visible - self.seed = seed - self.user = user - self.record_path = record_path - self.log_to_file = log_to_file - self.debug = debug - self.local_run = local_run - self.dry_run = dry_run - self.dry_run_logging = dry_run_logging - # create the record and logging paths if they don't exist - Path(self.record_path).parent.mkdir(parents=True, exist_ok=True) - # Path(self.log_to_file).parent.mkdir(parents=True, exist_ok=True) - # Registry path should be the auto_gpt_benchmarking folder - self.registry_path = None - - -class Evaluator: - def __init__(self, oai_run_args: OAIRunArgs): - self.oai_run_args = oai_run_args - registry_path = Path(__file__).parent - - # add registry path to the python system path - sys.path.append(str(registry_path)) - self.oai_run_args.registry_path = [registry_path] - # self.registry = Registry([registry_path]) - - def run(self): - oaieval.run(self.oai_run_args) diff --git a/auto_gpt_benchmarking/__init__.py b/auto_gpt_benchmarking/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/auto_gpt_benchmarking/__main__.py b/auto_gpt_benchmarking/__main__.py deleted file mode 100644 index 84761a65d..000000000 --- a/auto_gpt_benchmarking/__main__.py +++ 
/dev/null @@ -1,158 +0,0 @@ -""" -This is the main evaluation file. In it you can specify the following: - -1. The number of threads to use for evaluation. This is set to 1 by default.And will remain that way until we can spin - up containers on command -2. The timeout for each thread. This is set to 60 seconds by default. This is the amount of time each thread will run - for before it is killed when evaluating an agent -3. The path to the AutoGPT code. This is a required parameter as we do not know where your code lives. -4. The evals you would like to run. The options here are any OpenAI eval, or any of the evals defined in this repository - - -What this file does is it parses the params given and then runs the evals with OpenAI's evals framework. -""" - -import argparse -import os -import sys -from pathlib import Path -from datetime import datetime -import yaml -from datetime import datetime - - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser() - parser.add_argument( - "eval", type=str, help="Name of an eval. See registry.") - parser.add_argument( - "--completion-fn", - type=str, - dest="completion_fn", - default="auto_gpt_completion_fn", - help="One or more CompletionFn URLs, separated by commas (,). " - "A CompletionFn can either be the name of a model available in the OpenAI API or a key in the registry " - "(see evals/registry/completion_fns).", - ) - parser.add_argument( - "--timeout", - type=int, - default=300, - help="The timeout for each thread", - ) - parser.add_argument( - "--auto-gpt-path", - type=str, - default=None, - help="The path to the AutoGPT code. This updates auto_gpt_competion_fn.yaml in completion fns. " - "So you only need to set this once.", - ) - parser.add_argument("--extra_eval_params", type=str, default="") - parser.add_argument("--max_samples", type=int, default=None) - parser.add_argument( - "--cache", action=argparse.BooleanOptionalAction, default=True) - parser.add_argument( - "--visible", action=argparse.BooleanOptionalAction, default=None) - parser.add_argument("--seed", type=int, default=20220722) - parser.add_argument("--user", type=str, default="") - parser.add_argument("--record_path", type=str, default=None) - parser.add_argument( - "--log_to_file", type=str, default=None, # default=str( - # Path(__file__).parent.parent / "data" / "log" / "log.txt" - # ), help="Log to a file instead of stdout" - ) - parser.add_argument( - "--debug", action=argparse.BooleanOptionalAction, default=False) - parser.add_argument( - "--local-run", action=argparse.BooleanOptionalAction, default=True) - parser.add_argument( - "--dry-run", action=argparse.BooleanOptionalAction, default=False) - parser.add_argument("--dry-run-logging", - action=argparse.BooleanOptionalAction, default=True) - return parser.parse_args() - - -def update_yaml_with_auto_gpt_path(yaml_path: str, auto_gpt_path: str or None) -> Path: - """ - If there is a given auto_gpt_path, then we need to update the yaml file to include it in the system path - If we don't have one. Then we get the path from the yaml. - If none exists in the yaml and we don't have a path then we raise an exception. 
- :param yaml_path: The path to the yaml file - :param auto_gpt_path: The path to the AutoGPT code - :return: The path to the AutoGPT code - """ - with open(yaml_path, "r") as f: - yaml_data = yaml.safe_load(f) - if yaml_data["auto_gpt_completion_fn"]["args"]["auto_gpt_path"] is None and auto_gpt_path is None: - raise Exception( - "You must specify a auto_gpt_path in the yaml file or pass it in as a parameter") - if auto_gpt_path is None: - auto_gpt_path = yaml_data["auto_gpt_completion_fn"]["args"]["auto_gpt_path"] - if auto_gpt_path is not None: - yaml_data["auto_gpt_completion_fn"]["args"]["auto_gpt_path"] = auto_gpt_path - with open(yaml_path, "w") as f: - yaml.safe_dump(yaml_data, f) - - return Path(auto_gpt_path).absolute() - - -def load_env_file(env_path: Path): - if not env_path.exists(): - raise FileNotFoundError('You must set the OpenAI key in the AutoGPT env file. ' - 'We need your api keys to start the AutoGPT agent and use OpenAI evals') - with open(env_path, "r") as f: - # find the OPENAI_API_KEY key split it from the equals sign and assign it so OpenAI evals can use it. - for line in f.readlines(): - if line.startswith("OPENAI_API_KEY"): - os.environ["OPENAI_API_KEY"] = line.split("=")[1].strip() - break - - -if __name__ == "__main__": - args = parse_args() - # do not run in multiprocessing mode We do not use this right now, as it disables OpenAI's timeouts :( - # os.environ["EVALS_SEQUENTIAL"] = "1" - os.environ["EVALS_THREAD_TIMEOUT"] = str(args.timeout) - os.environ["EVALS_THREADS"] = str(1) - - # Update the yaml file with the auto_gpt_path - autogpt_path = update_yaml_with_auto_gpt_path( - str(Path(__file__).parent / "completion_fns" / - "auto_gpt_completion_fn.yaml"), - args.auto_gpt_path - ) - - # Add the benchmarks path to the system path so we can import auto_gpt_benchmarking - sys.path.append(str(Path(__file__).parent.parent.absolute())) - - # load all of the environment variables in the auto-gpt path/.env file - load_env_file(Path(autogpt_path) / ".env") - - # Obviously, a top level import would be better. This allows us to set the API key with the env file, as it gets - # set in the evaluator. We can't set it before the import because the import will fail without an API key. 
- from auto_gpt_benchmarking.Evaluator import Evaluator, OAIRunArgs - if args.record_path is None: - args.record_path = str(Path( - __file__).parent.parent / "data" / f"eval-{args.eval}-{datetime.now().strftime('%Y%m%d-%H%M%S')}.jsonl") - run_args = OAIRunArgs( - completion_fn=args.completion_fn, - eval=args.eval, - extra_eval_params=args.extra_eval_params, - max_samples=args.max_samples, - cache=args.cache, - visible=args.visible, - seed=args.seed, - user=args.user, - record_path=args.record_path, - log_to_file=args.log_to_file, - debug=args.debug, - local_run=args.local_run, - dry_run=args.dry_run, - dry_run_logging=args.dry_run_logging) - - # Run the evals - evaluator = Evaluator( - run_args - ) - evaluator.run() diff --git a/auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml b/auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml deleted file mode 100644 index a101f000a..000000000 --- a/auto_gpt_benchmarking/completion_fns/auto_gpt_completion_fn.yaml +++ /dev/null @@ -1,4 +0,0 @@ -auto_gpt_completion_fn: - args: - auto_gpt_path: - class: auto_gpt_benchmarking.CompletionFn:AutoGPTCompletionFn diff --git a/evals_analytics.ipynb b/evals_analytics.ipynb deleted file mode 100644 index f1b48424c..000000000 --- a/evals_analytics.ipynb +++ /dev/null @@ -1,220 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAEWCAYAAAB8LwAVAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAc5klEQVR4nO3deZgddZ3v8feHALLKYoLGrCiIoBDABlRQQBEiKsHHLVEUvGDmOuK43RnBmQcYnHH0ehVHwYHoRERlEREmwwQRRhEVkTSILEEkRiAJYBrCpiAY+Nw/6tfDSVOdPunu6pNOf17Pc56u+v1q+Vaf5Hy6llMl20RERPS1UacLiIiI9VMCIiIiaiUgIiKiVgIiIiJqJSAiIqJWAiIiImolICI6TNJrJN3e6Toi+kpAREdJukrSg5Ke0+lamiDpIEnLa9qvknQcgO2f2t6ljWWdIunbTdQZUScBER0jaTrwGsDAESO87o1Hcn3ru/w+ok4CIjrpfcC1wNnA0a0dkqZI+r6kHkkPSDq9pe8Dkm6T9KikxZL2Lu2WtFPLdGdL+qcyfJCk5ZI+Kek+4BuStpN0aVnHg2V4csv820v6hqR7Sv8lpf0WSW9pmW4TSfdL2mswv4S+exmlxhVl+26X9HpJM4FPAe+S9EdJvy7TvlDSAkmrJC2R9IGW5Wwu6Zul9tsk/V2f9dxZ1nUT8CdJG0s6QdLvWn63b22Z/hhJP5d0mqSHJC2V9OrSvkzSSklrvI8xuiUgopPeB3ynvA6T9HwASeOAS4G7gOnAJOD80vcO4JQy73Op9jweaHN9LwC2B6YBc6n+/X+jjE8FHgdOb5n+W8AWwMuAHYDTSvs5wFEt0x0O3Gv7V23W0S9JuwDHA/vY3ho4DLjT9g+AzwAX2N7K9owyy/nAcuCFwNuBz0h6Xek7mer39yLgDX1q7jUHeBOwre3VwO+o9uq2Af4R+LakiS3T7wfcBDwPOLesfx9gp7L80yVtNdTfQ6wnbOeV14i/gAOAvwDjy/hvgI+V4VcBPcDGNfNdDnykn2Ua2Kll/Gzgn8rwQcCTwGZrqWlP4MEyPBF4GtiuZroXAo8Czy3j3wP+rp9lHlSW81Cf12rguJZplpfhnYCVwCHAJn2WdQrw7ZbxKcBTwNYtbf8CnF2GlwKHtfQd17ueMn4n8L8GeJ9uBGaV4WOAO1r6di+/8+e3tD0A7Nnpf195Dc8rexDRKUcDP7R9fxk/l2cOM00B7nL1F21fU6j+yh2MHtt/7h2RtIWksyTdJekR4Gpg27IHMwVYZfvBvguxfQ/wc+BtkrYF3ki1F9Sfe2xv2/oCflY3oe0lwEepwmClpPMlvbCf5b6w1PhoS9tdVHtcvf3LWvpah2vbJL1P0o3lENJDwMuB8S2T/KFl+PFSc9+27EFsIBIQMeIkbQ68EzhQ0n3lnMDHgBmSZlB9aE3t58TpMuDF/Sz6MapDQr1e0Ke/762LPwHsAuxn+7nAa3tLLOvZvgRAnW9SHVJ5B/AL2yv6mW6d2T7X9gFUh74MfK6f+u8pNW7d0jYV6K3lXmByS9+UutX1DkiaBnyN6hDX80qQ3UL1+4gxKAERnXAk1aGR3agO6+wJ7Ar8lOrcwnVUH26flbSlpM0k7V/m/TrwfyS9QpWdygcbVIdD3i1pXDmpe+AAdWxN9RfvQ5K2pzpmD4Dte4HLgK+Wk9mbSHpty7yXAHsDH6E6JzEsJO0i6XWqLvv9c6nv6dL9B2C6pI1KjcuAa4B/Kb+jPYBjgd5LYb8LnFjqn0T1wb82W1IFRk+p5f1UexAxRiUgohOOBr5h+27b9/W+qE4Qv4fqL9a3UB2Pv5vqJOy7AGxfCPwz1SGpR6k+qLcvy/1Ime+hspxLBqjjS8DmwP1UV1P9oE//e6nOk/yG6rzAR3s7bD8OXATsCHy/7S0f2HOAz5aa7qM6OX5i6buw/HxA0g1leA7Vieh7gIuBk21fWfpOpfrd/R64kupcyRP9rdj2YuALwC+owmh3qkNpMUbJzgODIgZD0knAS2zXXR203pH0QWC27YH2rCKA7EFEDEo5JHUsMK/TtfRH0kRJ+0vaqFw++wmqvYyItiQgItZR+TLaMuAy21d3u
p612BQ4i+pQ3I+A/wC+2tGKYlTJIaaIiKiVPYiIiKi1Qd2ga/z48Z4+fXqny4iIGDWuv/76+21PqOvboAJi+vTpdHd3d7qMiIhRQ9Jd/fXlEFNERNRKQERERK0ERERE1EpARERErQRERETUSkBEREStxgJC1TOFf1yea3urpI/UTCNJXy7P0r1J5dnCpe9oSXeUV55zGxExwpr8HsRq4BO2bygPNLle0hXllsK93gjsXF77Af8G7Ndyb/4uqvvTXy9pQd3TvSIiohmN7UHYvtf2DWX4UeA2nnkUYq9ZwDmuXEv1uMeJVA9qv8J27yMfrwBmNlVrREQ824icg5A0HdgL+GWfrkms+Uzc5aWtv/a6Zc+V1C2pu6enZ9hqjohYn02cPBVJSGLi5KmNrKPxgJC0FdWTtz5q+5HhXr7teba7bHdNmFB7O5GIiA3OfSuWMe2TlzLtk5dy34plA88wCI0GhKRNqMLhO7brHsu4gjUfpD65tPXXHhERI6TJq5gE/Dtwm+0v9jPZAuB95WqmVwIPl4fFXw4cWh62vh1waGmLiIgR0uRVTPtTPfT9Zkk3lrZPAVMBbJ8JLAQOB5YAjwHvL32rJH0aWFTmO9X2qgZrjYiIPhoLCNs/AzTANAY+1E/ffGB+A6VFREQb8k3qiIiolYCIiIhaCYiIiKiVgIiIiFoJiIiIqJWAiIiIWgmIiIiolYCIiIhaCYiIiKiVgIiIiFoJiIiIqJWAiIiIWgmIiIiolYCIiIhaCYiIiKiVgIiIiFqNPTBI0nzgzcBK2y+v6f9b4D0tdewKTChPk7sTeBR4Clhtu6upOiMiol6TexBnAzP767T9edt72t4TOBH4SZ/Hih5c+hMOEREd0FhA2L4aaPc50nOA85qqJSIi1l3Hz0FI2oJqT+OilmYDP5R0vaS5naksImJsa+wcxDp4C/DzPoeXDrC9QtIOwBWSflP2SJ6lBMhcgKlTpzZfbUTEGNHxPQhgNn0OL9leUX6uBC4G9u1vZtvzbHfZ7powYUKjhUZEjCUdDQhJ2wAHAv/R0ralpK17h4FDgVs6U2FExNjV5GWu5wEHAeMlLQdOBjYBsH1mmeytwA9t/6ll1ucDF0vqre9c2z9oqs6IiKjXWEDYntPGNGdTXQ7b2rYUmNFMVRER0a714RxERESshxIQERFRKwERERG1EhAREVErAREREbUSEBERUSsBERERtRIQERFRKwERERG1EhAREVErAREREbUSEBERUSsBERERtRIQERFRKwERERG1EhAREVErAREREbUaCwhJ8yWtlFT7PGlJB0l6WNKN5XVSS99MSbdLWiLphKZqjIiI/jW5B3E2MHOAaX5qe8/yOhVA0jjgDOCNwG7AHEm7NVhnRETUaCwgbF8NrBrErPsCS2wvtf0kcD4wa1iLi4iIAXX6HMSrJP1a0mWSXlbaJgHLWqZZXtpqSZorqVtSd09PT5O1RkSMKZ0MiBuAabZnAF8BLhnMQmzPs91lu2vChAnDWV9ExJjWsYCw/YjtP5bhhcAmksYDK4ApLZNOLm0RETGCOhYQkl4gSWV431LLA8AiYGdJO0raFJgNLOhUnRERY9XGTS1Y0nnAQcB4ScuBk4FNAGyfCbwd+KCk1cDjwGzbBlZLOh64HBgHzLd9a1N1RkREvcYCwvacAfpPB07vp28hsLCJuiIioj2dvoopIiLWUwmIiIiolYCIiIhaCYiIiKiVgIiIiFoJiIiIqJWAiIiIWgmIiIiolYCIiIhaCYiIiKiVgIiIiFoJiIiIqJWAiIiIWgmIiIiolYCIiIhaCYiIiKiVgIiIiFqNBYSk+ZJWSrqln/73SLpJ0s2SrpE0o6XvztJ+o6TupmqMiIj+tRUQknYfxLLPBmaupf/3wIG2dwc+Dczr03+w7T1tdw1i3RERMUTt7kF8VdJ1kv5a0jbtzGD7amDVWvqvsf1gGb0WmNxmLRERMQLaCgjbrwHeA0wBrpd0rqQ3DGMdxwKXta4S+KGk6yXNXduMkuZK6pbU3dPTM4wlRUSMbRu3O6HtOyT9A9ANfBnYS5KAT9n+/mALkHQwVUAc0NJ8gO0VknYArpD0m7JHUlfXPMrhqa6uLg+2joiIWFO75yD2kHQacBvwOuAttnctw6cNduWS9gC+Dsyy/UBvu+0V5edK4GJg38GuIyIiBqfdcxBfAW4AZtj+kO0bAGzfA/zDYFYsaSrwfeC9tn/b0r6lpK17h4FDgdoroSIiojntHmJ6E/C47acAJG0EbGb7MdvfqptB0nnAQcB4ScuBk4FNAGyfCZwEPI/qBDjA6nLF0vOBi0vbxsC5tn8wuM2LiIjBajcgrgQOAf5YxrcAfgi8ur8ZbM9Z2wJtHwccV9O+FJjx7DkiImIktXuIaTPbveFAGd6imZIiImJ90G5A/EnS3r0jkl4BPN5MSRERsT5o9xDTR4ELJd0DCHgB8K6mioqIiM5rKyBsL5L0UmCX0nS77b80V1ZERHRa21+UA/YBppd59paE7XMaqSoiIjqurYCQ9C3gxcCNwFOl2UACIiJiA9XuHkQXsJvt3MoiImKMaPcqpluoTkxHRMQY0e4exHhgsaTrgCd6G20f0UhVERHRce0GxClNFhEREeufdi9z/YmkacDOtq+UtAUwrtnSIiKik9q93fcHgO8BZ5WmScAlDdUUERHrgXZPUn8I2B94BKqHBwE7NFVURER0XrsB8YTtJ3tHJG1M9T2IiIjYQLUbED+R9Clg8/Is6guB/2yurIiI6LR2A+IEoAe4GfgrYCGDfJJcRESMDu1exfQ08LXyioiIMaDdq5h+L2lp31cb882XtFJS7TOlVfmypCWSburzzImjJd1RXke3v0kRETEc1uVeTL02A94BbN/GfGcDp9P/Tf3eCOxcXvsB/wbsJ2l7qmdYd1GdDL9e0gLbD7ZZb0REDFFbexC2H2h5rbD9JeBNbcx3NbBqLZPMAs5x5VpgW0kTgcOAK2yvKqFwBTCznVojImJ4tHuIae+WV5ek/826PUuiP5OAZS3jy0tbf+11tc2V1C2pu6enZxhKinZMnDwVSUhi4uSpo275se7ynow97X7If6FleDVwJ/DOYa9mEGzPA+YBdHV15bsZI+S+FcuY9slLAbjrc28edcuPdZf3ZOxp9yqmgxta/wpgSsv45NK2AjioT/tVDdUQERE12n2i3MfX1m/7i4Nc/wLgeEnnU52kftj2vZIuBz4jabsy3aHAiYNcR0REDMK6XMW0D9UHOsBbgOuAO9Y2k6TzqPYExktaTnVl0iYAts+k+sLd4cAS4DHg/aVvlaRPA4vKok61vbaT3RERMczaDYjJwN62HwWQdArwX7aPWttMtucM0G+qGwHW9c0H5rdZX0REDLN2b7XxfODJlvEnS1tERGyg2t2DOAe4TtLFZfxI4JuNVBQREeuFdq9i+mdJlwGvKU3vt/2r5sqKiIhOa/cQE8AWwCO2/xVYLmnHhmqKiIj1QLvfpD4Z+CTPXGq6CfDtpoqKiIjOa3cP4q3AEcCfAGzfA2zdVFEREdF57QbEk+WSVANI
2rK5kiIiYn3QbkB8V9JZVHdb/QBwJXl4UETEBm3Aq5gkCbgAeCnwCLALcJLtKxquLSIiOmjAgLBtSQtt7071XIaIiBgD2j3EdIOkfRqtJCIi1ivtfpN6P+AoSXdSXckkqp2LPZoqLCIiOmutASFpqu27qR4BGhERY8hAexCXUN3F9S5JF9l+2wjUFBER64GBzkGoZfhFTRYSERHrl4ECwv0MR0TEBm6gQ0wzJD1CtSexeRmGZ05SP7fR6iIiomPWGhC2xw1l4ZJmAv8KjAO+bvuzffpPAw4uo1sAO9jetvQ9Bdxc+u62fcRQaomIiHXT7mWu60zSOOAM4A3AcmCRpAW2F/dOY/tjLdN/GNirZRGP296zqfoiImLt1uV5EOtqX2CJ7aW2nwTOB2atZfo5wHkN1hMREeugyYCYBCxrGV9e2p5F0jRgR+BHLc2bSeqWdK2kI/tbiaS5Zbrunp6eYSg7IiKg2YBYF7OB79l+qqVtmu0u4N3AlyS9uG5G2/Nsd9numjBhwkjUGhExJjQZECuAKS3jk0tbndn0Obxke0X5uRS4ijXPT0RERMOaDIhFwM6SdpS0KVUILOg7kaSXAtsBv2hp207Sc8rweGB/YHHfeSMiojmNXcVke7Wk44HLqS5znW/7VkmnAt22e8NiNnB+eWJdr12BsyQ9TRVin229+ikiIprXWEAA2F4ILOzTdlKf8VNq5rsG2L3J2iIiYu3Wl5PUERGxnklARERErQRERETUSkBEREStBERERNRKQERERK0ERERE1EpARERErQRERETUSkBEREStBERERNRKQERERK0ERERE1EpARERErQRERETUSkBEREStBERERNRqNCAkzZR0u6Qlkk6o6T9GUo+kG8vruJa+oyXdUV5HN1lnREQ8W2OPHJU0DjgDeAOwHFgkaUHNs6UvsH18n3m3B04GugAD15d5H2yq3oiIWFOTexD7AktsL7X9JHA+MKvNeQ8DrrC9qoTCFcDMhuqMiIgaTQbEJGBZy/jy0tbX2yTdJOl7kqas47xImiupW1J3T0/PcNQdERF0/iT1fwLTbe9BtZfwzXVdgO15trtsd02YMGHYC4yIGKuaDIgVwJSW8cml7X/YfsD2E2X068Ar2p03IiKa1WRALAJ2lrSjpE2B2cCC1gkkTWwZPQK4rQxfDhwqaTtJ2wGHlraIiBghjV3FZHu1pOOpPtjHAfNt3yrpVKDb9gLgbyQdAawGVgHHlHlXSfo0VcgAnGp7VVO1RkTEszUWEAC2FwIL+7Sd1DJ8InBiP/POB+Y3WV9ERPSv0yepIyJiPZWAiIiIWgmIiIiolYCIiIhaCYiIiKiVgIiIiFoJiIiIqJWAiIiIWgmIiIiolYCIiIhaCYiIiKiVgIiIiFoJiIiIqJWAiIiIWgmIiIiolYCIiIhaCYiIiKjVaEBIminpdklLJJ1Q0/9xSYsl3STpvyVNa+l7StKN5bWg77wREdGsxh45KmkccAbwBmA5sEjSAtuLWyb7FdBl+zFJHwT+L/Cu0ve47T2bqi8iItauyT2IfYEltpfafhI4H5jVOoHtH9t+rIxeC0xusJ6IiFgHTQbEJGBZy/jy0tafY4HLWsY3k9Qt6VpJR/Y3k6S5Zbrunp6eIRUcERHPaOwQ07qQdBTQBRzY0jzN9gpJLwJ+JOlm27/rO6/tecA8gK6uLo9IwRERY0CTexArgCkt45NL2xokHQL8PXCE7Sd6222vKD+XAlcBezVYa0RE9NFkQCwCdpa0o6RNgdnAGlcjSdoLOIsqHFa2tG8n6TlleDywP9B6cjsiIhrW2CEm26slHQ9cDowD5tu+VdKpQLftBcDnga2ACyUB3G37CGBX4CxJT1OF2Gf7XP0UERENa/QchO2FwMI+bSe1DB/Sz3zXALs3WVtERKxdvkkdERG1EhAREVErAREREbUSEBERUSsBERERtRIQERFRKwERERG1EhAREVErAREREbUSEBERUSsBERERtRIQERFRKwERERG1EhAREVErAREREbUSEBERUSsBERERtRoNCEkzJd0uaYmkE2r6nyPpgtL/S0nTW/pOLO23SzqsyTojIuLZGgsISeOAM4A3ArsBcyTt1meyY4EHbe8EnAZ8rsy7GzAbeBkwE/hqWV5ERIyQJvcg9gWW2F5q+0ngfGBWn2lmAd8sw98DXi9Jpf1820/Y/j2wpCwvIiJGyMYNLnsSsKxlfDmwX3/T2F4t6WHgeaX92j7zTqpbiaS5wNwy+kdJtw+y3vHA/YOcd7Qa0jbf9bk3/89wlevDq6Hlj7X3eVi3t+n3fJiMmfe45f0YL2mw2zytv44mA2JE2J4HzBvqciR12+4ahpJGjWzzhm+sbS9km4dTk4eYVgBTWsYnl7baaSRtDGwDPNDmvBER0aAmA2IRsLOkHSVtSnXSeUGfaRYAR5fhtwM/su3SPrtc5bQjsDNwXYO1RkREH40dYirnFI4HLgfGAfNt3yrpVKDb9gLg34FvSVoCrKIKEcp03wUWA6uBD9l+qqlaiyEfphqFss0bvrG2vZBtHjaq/mCPiIhYU75JHRERtRIQERFRa8wFxFBu/zEatbG9H5e0WNJNkv5bUr/XRI8WA21zy3Rvk2RJo/6SyHa2WdI7y3t9q6RzR7rG4dbGv+2pkn4s6Vfl3/fhnahzuEiaL2mlpFv66ZekL5ffx02S9h7ySm2PmRfVyfLfAS8CNgV+DezWZ5q/Bs4sw7OBCzpdd8PbezCwRRn+4Gje3na3uUy3NXA11Rcyuzpd9wi8zzsDvwK2K+M7dLruEdjmecAHy/BuwJ2drnuI2/xaYG/gln76DwcuAwS8EvjlUNc51vYghnL7j9FowO21/WPbj5XRa6m+czKatfMeA3ya6t5ffx7J4hrSzjZ/ADjD9oMAtleOcI3DrZ1tNvDcMrwNcM8I1jfsbF9NdbVnf2YB57hyLbCtpIlDWedYC4i623/0vYXHGrf/AHpv/zEatbO9rY6l+gtkNBtwm8uu9xTb/zWShTWonff5JcBLJP1c0rWSZo5Ydc1oZ5tPAY6StBxYCHx4ZErrmHX9/z6gUX+rjRgeko4CuoADO11LkyRtBHwROKbDpYy0jakOMx1EtZd4taTdbT/UyaIaNgc42/YXJL2K6jtXL7f9dKcLGy3G2h7EUG7/MRq1dcsSSYcAfw8cYfuJEaqtKQNt89bAy4GrJN1Jdax2wSg/Ud3O+7wcWGD7L67ukPxbqsAYrdrZ5mOB7wLY/gWwGdWN/DZUw36LorEWEEO5/cdoNOD2StoLOIsqHEb7cWkYYJttP2x7vO3ptqdTnXc5wnZ3Z8odFu38u76Eau8BSeOpDjktHcEah1s723w38HoASbtSBUTPiFY5shYA7ytXM70SeNj2vUNZ4Jg6xOQh3P5jNGpzez8PbAVcWM7F3237iI4VPURtbvMGpc1tvhw4VNJi4Cngb22P1j3jdrf5E8DXJH2M6oT1MaP4jz0knUcV8uPLeZWTgU0AbJ9JdZ7lcKrn5zwGvH/I6xzFv6+IiGjQWDvEFBERbUpARERErQR
ERETUSkBEREStBERERNRKQET0IenIcpfXl3a6lohOSkBEPNsc4GflZyMkjWtq2RHDJQER0ULSVsABVLdpmF3axkn6f5JuKffZ/3Bp30fSNZJ+Lek6SVtLOkbS6S3Lu1TSQWX4j5K+IOnXwKsknSRpUVnuvN67BkvaSdKVZbk3SHqxpHMkHdmy3O9IqrtLbcSwSUBErGkW8APbvwUekPQKYC4wHdjT9h7Ad8rtHS4APmJ7BnAI8PgAy96S6h79M2z/DDjd9j62Xw5sDry5TPcdqltzzwBeDdxL9Q3/YwAkbVPaN5S70cZ6KgERsaY5VM8WoPycQ/Xhf1a5/Tu2VwG7APfaXlTaHuntX4ungItaxg9W9dTCm4HXAS+TtDUwyfbFZbl/tv2Y7Z9Q3XtoQqnpojbWFzEkY+peTBFrI2l7qg/q3SWZ6h4/proxXLtWs+YfXpu1DP/Z9lNlXZsBX6V6mt0ySaf0mbbOOcBRVIe+hnyfnYiBZA8i4hlvB75le1q52+sU4PdUj7P8q3L7994guR2YKGmf0rZ16b8T2FPSRpKmUD35rE5vGNxfznu8HcD2o8Dy3vMNqp6RvkWZ9mzgo2W6xcO21RH9SEBEPGMOcHGftouAiVS3jr6pnGB+d3nM5buAr5S2K6g+9H9OFSqLgS8DN9StqDyo52vALVR3JG3dS3kv8DeSbgKuAV5Q5vkDcBvwjaFuaEQ7cjfXiFGi7EncDOxt++FO1xMbvuxBRIwC5al/twFfSTjESMkeRERE1MoeRERE1EpARERErQRERETUSkBEREStBERERNT6/5WLAWlxQhHkAAAAAElFTkSuQmCC", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "import os\n", - "import json\n", - "import matplotlib.pyplot as plt\n", - "\n", - "def load_jsonl_files_recursively(dir_path):\n", - " all_data = []\n", - " \n", - " for root, _, files in os.walk(dir_path):\n", - " for file in files:\n", - " if file.endswith(\".jsonl\"):\n", - " file_path = os.path.join(root, file)\n", - " with open(file_path, \"r\") as f:\n", - " file_data = [json.loads(line) for line in f]\n", - " all_data.extend(file_data)\n", - " \n", - " return all_data\n", - "\n", - "def extract_accuracies(data):\n", - " accuracies = []\n", - " for record in data:\n", - " if 'final_report' in record:\n", - " accuracy = record['final_report']['accuracy']\n", - " accuracies.append(accuracy)\n", - " return accuracies\n", - "\n", - "# Load the data recursively\n", - "dir_path = \"evals\"\n", - "data = load_jsonl_files_recursively(dir_path)\n", - "\n", - "# Extract accuracies from the data\n", - "accuracies = extract_accuracies(data)\n", - "\n", - "# Plot the accuracies in a histogram chart\n", - "plt.hist(accuracies, bins=100, range=(0, 1), edgecolor='black')\n", - "plt.xlabel(\"Accuracy\")\n", - "plt.ylabel(\"Frequency\")\n", - "plt.title(\"Accuracy Histogram\")\n", - "plt.show()\n" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Counts for each eval_name:\n", - "test-match.s1.simple-v0: 22\n", - "None: 45\n", - "test-fuzzy-match.s1.simple-v0: 2\n" - ] - } - ], - "source": [ - "import os\n", - "import json\n", - "import numpy as np\n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "\n", - "def print_graph():\n", - " directory = 'evals/'\n", - " files = os.listdir(directory)\n", - " \n", - " eval_name_counter = {}\n", - " \n", - " for file in files:\n", - " if file.endswith(\".jsonl\"):\n", - " with open(os.path.join(directory, file), 'r') as f:\n", - " jsonl_content = f.read()\n", - " \n", - " # Read the JSONL content into a DataFrame\n", - " data = [json.loads(line) for line in jsonl_content.split('\\n') if line]\n", - " df = pd.DataFrame(data)\n", - "\n", - " if 'spec' not in df.columns:\n", - " continue\n", - "\n", - " # Extract the \"eval_name\" from the \"spec\" dictionaries\n", - " df['eval_name'] = df['spec'].apply(lambda x: x['eval_name'] if isinstance(x, dict) else None)\n", - "\n", - " for eval_name in df['eval_name']:\n", - " if eval_name not in eval_name_counter:\n", - " eval_name_counter[eval_name] = 0\n", - " eval_name_counter[eval_name] += 1\n", - "\n", - " # Print the counts\n", - " print(\"Counts for each eval_name:\")\n", - " for eval_name, count in eval_name_counter.items():\n", - " print(f\"{eval_name}: {count}\")\n", - "\n", - "print_graph()\n" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Counts for each eval_name:\n", - "test-match.s1.simple-v0: 22\n", - "test-fuzzy-match.s1.simple-v0: 2\n" - ] - } - ], - "source": [ - "import os\n", - "import json\n", - "import numpy as np\n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "\n", - "def print_graph():\n", - " directory = 'evals/'\n", - " files = os.listdir(directory)\n", - " \n", - " eval_name_counter = {}\n", - " \n", - " for file in files:\n", - " if file.endswith(\".jsonl\"):\n", - " with open(os.path.join(directory, file), 'r') 
as f:\n", - " jsonl_content = f.read()\n", - " \n", - " # Read the JSONL content into a DataFrame\n", - " data = [json.loads(line) for line in jsonl_content.split('\\n') if line]\n", - " df = pd.DataFrame(data)\n", - "\n", - " if 'spec' not in df.columns:\n", - " continue\n", - "\n", - " # Filter the DataFrame to only include rows with the \"spec\" key\n", - " spec_df = df[df['spec'].notna()].copy()\n", - "\n", - " # Extract the \"eval_name\" from the \"spec\" dictionaries\n", - " spec_df.loc[:, 'eval_name'] = spec_df['spec'].apply(lambda x: x['eval_name'])\n", - "\n", - " for eval_name in spec_df['eval_name']:\n", - " if eval_name not in eval_name_counter:\n", - " eval_name_counter[eval_name] = 0\n", - " eval_name_counter[eval_name] += 1\n", - "\n", - " # Print the counts\n", - " print(\"Counts for each eval_name:\")\n", - " for eval_name, count in eval_name_counter.items():\n", - " print(f\"{eval_name}: {count}\")\n", - "\n", - "print_graph()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.10.0 64-bit", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.0" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 000000000..12a0390ef --- /dev/null +++ b/poetry.lock @@ -0,0 +1,101 @@ +# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. + +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." 
+optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +files = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] + +[[package]] +name = "exceptiongroup" +version = "1.1.1" +description = "Backport of PEP 654 (exception groups)" +optional = false +python-versions = ">=3.7" +files = [ + {file = "exceptiongroup-1.1.1-py3-none-any.whl", hash = "sha256:232c37c63e4f682982c8b6459f33a8981039e5fb8756b2074364e5055c498c9e"}, + {file = "exceptiongroup-1.1.1.tar.gz", hash = "sha256:d484c3090ba2889ae2928419117447a14daf3c1231d5e30d0aae34f354f01785"}, +] + +[package.extras] +test = ["pytest (>=6)"] + +[[package]] +name = "iniconfig" +version = "2.0.0" +description = "brain-dead simple config-ini parsing" +optional = false +python-versions = ">=3.7" +files = [ + {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, + {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, +] + +[[package]] +name = "packaging" +version = "23.1" +description = "Core utilities for Python packages" +optional = false +python-versions = ">=3.7" +files = [ + {file = "packaging-23.1-py3-none-any.whl", hash = "sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61"}, + {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, +] + +[[package]] +name = "pluggy" +version = "1.0.0" +description = "plugin and hook calling mechanisms for python" +optional = false +python-versions = ">=3.6" +files = [ + {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, + {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, +] + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + +[[package]] +name = "pytest" +version = "7.3.2" +description = "pytest: simple powerful testing with Python" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pytest-7.3.2-py3-none-any.whl", hash = "sha256:cdcbd012c9312258922f8cd3f1b62a6580fdced17db6014896053d47cddf9295"}, + {file = "pytest-7.3.2.tar.gz", hash = "sha256:ee990a3cc55ba808b80795a79944756f315c67c12b56abd3ac993a7b8c17030b"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "sys_platform == \"win32\""} +exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=0.12,<2.0" +tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} + +[package.extras] +testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] + +[[package]] +name = "tomli" +version = "2.0.1" +description = "A lil' TOML parser" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, + {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, +] + +[metadata] +lock-version = "2.0" +python-versions = "^3.9" +content-hash = 
"c5b989915c413ab901c39dd0c4f3b0fe203558c2879952a2460a52bda4f3e857" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..2c099a5b8 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,23 @@ +[tool.poetry] +name = "agbenchmark" +version = "0.1.0" +description = "Benchmarking the performance of agents far and wide, regardless of how they are set up and how they work" +authors = ["Silen Naihin "] +license = "MIT" +readme = "README.md" + +[tool.poetry.dependencies] +python = "^3.9" +pytest = "^7.3.2" + + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + +[tool.pytest.ini_options] +minversion = "6.0" +addopts = "-ra -q" +testpaths = [ + "tests", "benchmark/challenges", +] diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index b1c5914ad..000000000 --- a/requirements.txt +++ /dev/null @@ -1,81 +0,0 @@ -aiodocker==0.21.0 -aiohttp==3.8.4 -aiosignal==1.3.1 -asn1crypto==1.5.1 -async-timeout==4.0.2 -attrs==23.1.0 -backoff==2.2.1 -blobfile==2.0.1 -cachetools==5.3.0 -certifi==2022.12.7 -cffi==1.15.1 -charset-normalizer==2.1.1 -click==8.1.3 -colorama==0.4.6 -contourpy==1.0.7 -cryptography==40.0.2 -cycler==0.11.0 -dataclasses-json==0.5.7 -docker==6.0.1 -evals==1.0.2.post1 -filelock==3.11.0 -fire==0.5.0 -fonttools==4.39.3 -frozenlist==1.3.3 -gptcache==0.1.13 -greenlet==2.0.2 -idna==3.4 -importlib-resources==5.12.0 -joblib==1.2.0 -kiwisolver==1.4.4 -langchain==0.0.142 -langdetect==1.0.9 -lxml==4.9.2 -lz4==4.3.2 -marshmallow==3.19.0 -marshmallow-enum==1.5.1 -matplotlib==3.7.1 -mock==5.0.2 -multidict==6.0.4 -mypy==1.2.0 -mypy-extensions==1.0.0 -nltk==3.8.1 -numexpr==2.8.4 -numpy==1.24.2 -openai==0.27.4 -openapi-schema-pydantic==1.2.4 -oscrypto==1.3.0 -packaging==23.1 -pandas==1.5.3 -Pillow==9.5.0 -portalocker==2.7.0 -pyarrow==10.0.1 -pycparser==2.21 -pycryptodomex==3.17 -pydantic==1.10.7 -PyJWT==2.6.0 -pyOpenSSL==23.1.1 -pyparsing==3.0.9 -python-dateutil==2.8.2 -pytz==2023.3 -PyYAML==6.0 -pyzstd==0.15.6 -regex==2023.3.23 -requests==2.28.2 -sacrebleu==2.3.1 -setuptools-scm==7.1.0 -six==1.16.0 -snowflake-connector-python==3.0.2 -SQLAlchemy==1.4.47 -tabulate==0.9.0 -tenacity==8.2.2 -termcolor==2.2.0 -tiktoken==0.3.3 -tomli==2.0.1 -tqdm==4.65.0 -typing-inspect==0.8.0 -typing_extensions==4.5.0 -urllib3==1.26.15 -websocket-client==1.5.1 -yarl==1.8.2 -zipp==3.15.0 diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_api.py b/tests/test_api.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_workspace_manager.py b/tests/test_workspace_manager.py new file mode 100644 index 000000000..e69de29bb -- cgit v1.2.3 From f37981c3884264c50f2af93799f3667b5dc42cca Mon Sep 17 00:00:00 2001 From: scarletpan Date: Mon, 19 Jun 2023 12:39:34 +0000 Subject: init first challenge template --- agbenchmark/benchmark/challenges/Challenge.py | 11 +++++++ .../benchmark/challenges/retrieval/r1_test.py | 29 +++++++++++++++++ data/README.md | 37 ++++++++++++++++++++++ data/retrieval/r1_test_data_0.json | 10 ++++++ data/retrieval/r1_test_data_1.json | 10 ++++++ examples/basic_gpt_agent.py | 26 +++++++++++++++ 6 files changed, 123 insertions(+) create mode 100644 data/README.md create mode 100644 data/retrieval/r1_test_data_0.json create mode 100644 data/retrieval/r1_test_data_1.json create mode 100644 examples/basic_gpt_agent.py diff --git 
a/agbenchmark/benchmark/challenges/Challenge.py b/agbenchmark/benchmark/challenges/Challenge.py index e69de29bb..bed522a85 100644 --- a/agbenchmark/benchmark/challenges/Challenge.py +++ b/agbenchmark/benchmark/challenges/Challenge.py @@ -0,0 +1,11 @@ +import json + +class Challenge(object): + def __init__(self, json_data): + self.json_data = json_data + + @classmethod + def from_json_file(cls, json_file): + with open(json_file) as f: + json_data = json.load(f) + return cls(json_data) \ No newline at end of file diff --git a/agbenchmark/benchmark/challenges/retrieval/r1_test.py b/agbenchmark/benchmark/challenges/retrieval/r1_test.py index e69de29bb..f300d094c 100644 --- a/agbenchmark/benchmark/challenges/retrieval/r1_test.py +++ b/agbenchmark/benchmark/challenges/retrieval/r1_test.py @@ -0,0 +1,29 @@ +from ..Challenge import Challenge + + +class RetrievelChallenge(Challenge): + """ Challenge for information-retrieval """ + def __init__(self, json_data): + self.json_data = json_data + assert self.json_data["category"] == "information-retrieval" + + @property + def agent_input(self): + return self.json_data["query"] + + def scoring(self, content): + for should_contain_word in self.json_data["ground"]["should_contain"]: + if should_contain_word not in content: + return 0. + + for should_not_contain_word in self.json_data["ground"]["should_not_contain"]: + if should_not_contain_word in content: + return 0. + return 1. + + def run(self, output_file): + output = open(output_file).read().strip() + + score = self.scoring(output) + + return score \ No newline at end of file diff --git a/data/README.md b/data/README.md new file mode 100644 index 000000000..d3e32b563 --- /dev/null +++ b/data/README.md @@ -0,0 +1,37 @@ +# Challenges Data Schema of Benchmark + +## General challenges +Input: +- **category** (str): information-retrieval +- **difficulty_level** (str): the difficulty of this query. choices from ["easy", "medium", "hard"] + + + +## Information-retrieval challenges +Input: +- **category** (str): information-retrieval +- **query** (str): the question that needs to be solved. +- **ground** (dict): The ground truth. + - **answer** (str): The raw text of the ground truth answer + - **should_contain** (list): the exact strings that are required in the final answer + - **should_not_contain** (list): the exact strings that should not be in the final answer +- **difficulty_level** (str): the difficulty of this query. choices from ["easy", "medium", "hard"]
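As a minimal sketch of how a grader consumes the `ground` block (this condenses the `scoring` method from `r1_test.py` above; the helper name `score_ground` is illustrative, not part of the repo):

```python
# Illustrative only: condensed form of the should_contain / should_not_contain checks.
def score_ground(content: str, ground: dict) -> float:
    # Fail if any required string is missing from the agent's answer.
    if any(word not in content for word in ground.get("should_contain", [])):
        return 0.0
    # Fail if any forbidden string appears in the agent's answer.
    if any(word in content for word in ground.get("should_not_contain", [])):
        return 0.0
    return 1.0


assert score_ground(
    "The capital of America is Washington, D.C.",
    {"should_contain": ["Washington"], "should_not_contain": ["New York"]},
) == 1.0
```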
choices from ["easy", "medium", "hard"] + +Example: +```python +{ + "category": "information-retrieval", + "query": "what is the capital of America", + "ground": { + "answer": "Washington", + "should_contain": ["Washington"], + "should_not_contain": ["New York", "Los Angeles", "San Francisco"] + }, + "difficulty_level": "easy" +} +``` + + + +Output: +- **score** (float): scores range from [0, 1] \ No newline at end of file diff --git a/data/retrieval/r1_test_data_0.json b/data/retrieval/r1_test_data_0.json new file mode 100644 index 000000000..a64f7e0cc --- /dev/null +++ b/data/retrieval/r1_test_data_0.json @@ -0,0 +1,10 @@ +{ + "category": "information-retrieval", + "query": "what is the capital of America", + "ground": { + "answer": "Washington", + "should_contain": ["Washington"], + "should_not_contain": ["New York", "Los Angeles", "San Francisco"] + }, + "difficulty_level": "easy" +} \ No newline at end of file diff --git a/data/retrieval/r1_test_data_1.json b/data/retrieval/r1_test_data_1.json new file mode 100644 index 000000000..73dec4cdd --- /dev/null +++ b/data/retrieval/r1_test_data_1.json @@ -0,0 +1,10 @@ +{ + "category": "information-retrieval", + "query": "The Nobel Prize in Literature 2012", + "ground": { + "answer": "Mo Yan", + "should_contain": ["Mo Yan"], + "should_not_contain": ["Murakami Haruki"] + }, + "difficulty_level": "easy" +} \ No newline at end of file diff --git a/examples/basic_gpt_agent.py b/examples/basic_gpt_agent.py new file mode 100644 index 000000000..e2cc380c8 --- /dev/null +++ b/examples/basic_gpt_agent.py @@ -0,0 +1,26 @@ +import json +import openai +from agbenchmark.benchmark.challenges.retrieval.r1_test import RetrievelChallenge + + +def basic_gpt_agent(challenge_file): + challenge = RetrievelChallenge.from_json_file(challenge_file) + + response = openai.ChatCompletion.create( + model="gpt-3.5-turbo-0613", + messages=[{"role": "user", "content": challenge.agent_input}]) + answer = response["choices"][0]["message"]["content"] + + output_file = "./basic_gpt_agent_retrieval_results.txt" + with open(output_file, "w") as f: + f.write(answer) + + print("QUERY : ", challenge.agent_input) + print("AGENT ANSWER: ", answer) + + score = challenge.run(output_file) + + print("AGENT SCORE : ", score) + +if __name__ == "__main__": + basic_gpt_agent("./data/retrieval/r1_test_data_1.json") -- cgit v1.2.3 From 1eb278f3cc36ad5087f3ec30ea8c4e6fc8efca3a Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Mon, 19 Jun 2023 09:53:30 -0400 Subject: Update README.md --- README.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 820c0f51e..02f792b70 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,14 @@ A repo built for the purpose of benchmarking the performance of agents far and wide, regardless of how they are set up and how they work -#### MVP: function calls api, api returns presigned url, folder is uploaded, write file challenge is measured, score is given +Simple boilerplate code that spins up a webserver to plug their agent into. We call multiple tasks by invoking different pytest commands on folders and once the agent stops or reaches 50 loops (which they will have to define). We handle the deletion of files after a run loop ends. Then we call call the POST request for the next task. 
Then we will spit out a combined benchmark once all tests run -#### Diagrams: https://whimsical.com/agbenchmark-5n4hXBq1ZGzBwRsK4TVY7x +- Agent adds tests by adding to our repo +- Agent abstracted from benchmark +- Scalable (parallel servers running tests) +- Better standardization + +##### Diagrams (out of date, cloud oriented): https://whimsical.com/agbenchmark-5n4hXBq1ZGzBwRsK4TVY7x ## Contributing -- cgit v1.2.3 From b7deb984f7749db4ba3c62dc0a34ddbda966af02 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Wed, 21 Jun 2023 11:43:18 -0400 Subject: start click, fixtures, types, challenge creation, mock run -stable (#37) --- README.md | 157 +++-- agbenchmark/Challenge.py | 32 + agbenchmark/benchmark/__init__.py | 0 agbenchmark/benchmark/benchmark.py | 1 - agbenchmark/benchmark/challenges/Challenge.py | 11 - agbenchmark/benchmark/challenges/__init__.py | 0 .../benchmark/challenges/adaptability/a1_test.py | 0 .../challenges/basic_abilities/browse_test.py | 0 .../challenges/basic_abilities/read_file_test.py | 0 .../basic_abilities/remember_context_test.py | 0 .../challenges/basic_abilities/write_file_test.py | 0 agbenchmark/benchmark/challenges/code/c1_test.py | 0 agbenchmark/benchmark/challenges/memory/m1_test.py | 0 .../benchmark/challenges/retrieval/r1_test.py | 29 - agbenchmark/benchmark/challenges/utils.py | 0 .../challenges/web_navigation/wn1_test.py | 0 .../benchmark/challenges/writing/w1_test.py | 0 agbenchmark/benchmark/run.py | 1 - agbenchmark/challenges/README.md | 42 ++ agbenchmark/challenges/__init__.py | 0 agbenchmark/challenges/adaptability/a1_test.py | 0 agbenchmark/challenges/code/c1_test.py | 0 agbenchmark/challenges/define_task_types.py | 29 + agbenchmark/challenges/memory/m1_test.py | 0 agbenchmark/challenges/retrieval/Retrieval.py | 27 + agbenchmark/challenges/retrieval/r1/r1_data.json | 11 + agbenchmark/challenges/retrieval/r1/r1_test.py | 25 + agbenchmark/challenges/web_navigation/wn1_test.py | 0 agbenchmark/challenges/writing/w1_test.py | 0 agbenchmark/config.json | 5 + agbenchmark/conftest.py | 43 ++ agbenchmark/metrics.py | 10 + agbenchmark/mocks/basic_gpt_agent.py | 20 + agbenchmark/mocks/tests/retrieval_manual.py | 10 + agbenchmark/server/__init__.py | 0 agbenchmark/server/api.py | 0 agbenchmark/server/utils.py | 0 agbenchmark/start_benchmark.py | 48 ++ agbenchmark/tests/basic_abilities/browse_test.py | 0 .../tests/basic_abilities/read_file_test.py | 0 .../tests/basic_abilities/remember_context_test.py | 0 .../tests/basic_abilities/write_file_test.py | 0 agbenchmark/utils.py | 1 + agbenchmark/workspace/__init__.py | 0 agbenchmark/workspace/cloud_services/aws.py | 0 agbenchmark/workspace/workspace_manager.py | 1 - data/README.md | 37 -- data/retrieval/r1_test_data_0.json | 10 - data/retrieval/r1_test_data_1.json | 10 - examples/basic_gpt_agent.py | 26 - file_to_check.txt | 1 + poetry.lock | 669 ++++++++++++++++++++- pyproject.toml | 13 +- tests/__init__.py | 0 tests/test_api.py | 0 tests/test_benchmark.py | 0 tests/test_workspace_manager.py | 0 57 files changed, 1052 insertions(+), 217 deletions(-) create mode 100644 agbenchmark/Challenge.py delete mode 100644 agbenchmark/benchmark/__init__.py delete mode 100644 agbenchmark/benchmark/benchmark.py delete mode 100644 agbenchmark/benchmark/challenges/Challenge.py delete mode 100644 agbenchmark/benchmark/challenges/__init__.py delete mode 100644 agbenchmark/benchmark/challenges/adaptability/a1_test.py delete mode 100644 agbenchmark/benchmark/challenges/basic_abilities/browse_test.py delete mode 100644 
agbenchmark/benchmark/challenges/basic_abilities/read_file_test.py delete mode 100644 agbenchmark/benchmark/challenges/basic_abilities/remember_context_test.py delete mode 100644 agbenchmark/benchmark/challenges/basic_abilities/write_file_test.py delete mode 100644 agbenchmark/benchmark/challenges/code/c1_test.py delete mode 100644 agbenchmark/benchmark/challenges/memory/m1_test.py delete mode 100644 agbenchmark/benchmark/challenges/retrieval/r1_test.py delete mode 100644 agbenchmark/benchmark/challenges/utils.py delete mode 100644 agbenchmark/benchmark/challenges/web_navigation/wn1_test.py delete mode 100644 agbenchmark/benchmark/challenges/writing/w1_test.py delete mode 100644 agbenchmark/benchmark/run.py create mode 100644 agbenchmark/challenges/README.md create mode 100644 agbenchmark/challenges/__init__.py create mode 100644 agbenchmark/challenges/adaptability/a1_test.py create mode 100644 agbenchmark/challenges/code/c1_test.py create mode 100644 agbenchmark/challenges/define_task_types.py create mode 100644 agbenchmark/challenges/memory/m1_test.py create mode 100644 agbenchmark/challenges/retrieval/Retrieval.py create mode 100644 agbenchmark/challenges/retrieval/r1/r1_data.json create mode 100644 agbenchmark/challenges/retrieval/r1/r1_test.py create mode 100644 agbenchmark/challenges/web_navigation/wn1_test.py create mode 100644 agbenchmark/challenges/writing/w1_test.py create mode 100644 agbenchmark/config.json create mode 100644 agbenchmark/conftest.py create mode 100644 agbenchmark/metrics.py create mode 100644 agbenchmark/mocks/basic_gpt_agent.py create mode 100644 agbenchmark/mocks/tests/retrieval_manual.py delete mode 100644 agbenchmark/server/__init__.py delete mode 100644 agbenchmark/server/api.py delete mode 100644 agbenchmark/server/utils.py create mode 100644 agbenchmark/start_benchmark.py create mode 100644 agbenchmark/tests/basic_abilities/browse_test.py create mode 100644 agbenchmark/tests/basic_abilities/read_file_test.py create mode 100644 agbenchmark/tests/basic_abilities/remember_context_test.py create mode 100644 agbenchmark/tests/basic_abilities/write_file_test.py create mode 100644 agbenchmark/utils.py delete mode 100644 agbenchmark/workspace/__init__.py delete mode 100644 agbenchmark/workspace/cloud_services/aws.py delete mode 100644 agbenchmark/workspace/workspace_manager.py delete mode 100644 data/README.md delete mode 100644 data/retrieval/r1_test_data_0.json delete mode 100644 data/retrieval/r1_test_data_1.json delete mode 100644 examples/basic_gpt_agent.py create mode 100644 file_to_check.txt delete mode 100644 tests/__init__.py delete mode 100644 tests/test_api.py delete mode 100644 tests/test_benchmark.py delete mode 100644 tests/test_workspace_manager.py diff --git a/README.md b/README.md index 02f792b70..216f1202c 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,8 @@ -# agbenchmark +# Auto-GPT Benchmark A repo built for the purpose of benchmarking the performance of agents far and wide, regardless of how they are set up and how they work -Simple boilerplate code that spins up a webserver to plug their agent into. We call multiple tasks by invoking different pytest commands on folders, running until the agent stops or reaches 50 loops (which they will have to define). We handle the deletion of files after a run loop ends. Then we call the POST request for the next task.
Then we will spit out a combined benchmark once all tests run - -- Agent adds tests by adding to our repo -- Agent abstracted from benchmark -- Scalable (parallel servers running tests) -- Better standardization - -##### Diagrams (out of date, cloud oriented): https://whimsical.com/agbenchmark-5n4hXBq1ZGzBwRsK4TVY7x +##### Diagrams: https://whimsical.com/agbenchmark-5n4hXBq1ZGzBwRsK4TVY7x ## Contributing @@ -19,7 +12,7 @@ - To add requirements `poetry add requirement`. - To run in venv `poetry run python script.py` -Feel free to merge with `main` at will (but also to ask for review) - if you can't send msg in R&D chat for access. +Feel free to create PRs to merge with `main` at will (but also feel free to ask for review) - if you can't send msg in R&D chat for access. If you push at any point and break things - it'll happen to everyone - fix it asap. Step 1 is to revert `main` to last working commit @@ -27,110 +20,96 @@ Let people know what beautiful code you write does, document everything well Share your progress :) -## Api +## How this works -FastAPI with REST, import requests +1. `pip install auto-gpt-benchmarks` +2. Add boilerplate code to start webserver to your agent (run loop and stop condition) +3. `agbenchmark start --challenge challenge_category` (remove the challenge flag to run all tests). Specify the hostname, port, and workspace directory in the config +4. We call the server to run the agent for each test +5. Show pass rate of tests, logs, and any other metrics -``` -POST hostname:8080/challenges -{ - "test_name": "" - "challenge": "memory" - optional -} -``` +### To run the basic existing mock (June 21) -## Auth: +1. Clone the repo `auto-gpt-benchmarks` +2. `pip install poetry` +3. `poetry shell` +4. `poetry install` +5. `agbenchmark start` + Keep config the same and watch the logs :) -get preSignedUrl from API +#### Bonuses + +- You can add tests by git cloning auto-gpt-benchmarks to your repo +- Agent is abstracted from benchmark, don't need to do any extra setup other than starting the server +- Simple, easy to use +- Don't have to deal with cloud or parallelization yet + +### Pytest + +To create a test: + ``` -POST preSignedUrl -{ - "artifacts": [{}] -} +@pytest.mark.parametrize( +"server_response", +["VARIABLE"], # VARIABLE = the query/goal you provide to the model +indirect=True, +) +@pytest.mark.(VARIABLE) # VARIABLE = category of the test +def test_file_in_workspace(workspace): # VARIABLE = the actual test that asserts +assert os.path.exists(os.path.join(workspace, "file_to_check.txt")) ``` -## Workspace +## Api + +FastAPI with REST, import requests to call in auto-gpt-benchmarks. Boilerplate code given to agent project to start server -Kubernetes with AWS3 or GCP +## Workspace -## Challenges +Defined by the user on config #### Dataset Manually created, existing challenges within Auto-Gpt, https://osu-nlp-group.github.io/Mind2Web/ -#### Simple challenge creation through a DSL (domain specific language) +## Repo ``` -Challenge TicTacToeCoding - Description "The agent should implement a basic tic-tac-toe game in Python." - Artifacts { - Code "tictactoe.py" - } - Tasks { - Code "Write a function to initialize the game board." - Code "Write a function to handle a player's turn." - Code "Write a function to check for a winning move." Test "Write tests for the blog post model, serializer, and view." Command "Run Django's test suite to ensure everything is working as expected."
- } - SuccessCriteria { - Correctness "The game should correctly alternate between two players." - Correctness "The game should correctly identify a winning move." - Efficiency "The game should not use unnecessary computational resources." - Design "The solution should follow good practices for Django and Django Rest Framework." - } -EndChallenge +|-- auto-gpt-benchmarks/ **main project directory** +| |-- metrics.py **combining scores, metrics, final evaluation** +| |-- start_benchmark.py **entry point from cli** +| |-- conftest.py **shared fixtures across all tests** +| |-- Challenge.py **easy challenge creation class?** +| |-- config.json **hostname, port, workspace folder** +| |-- challenges/ **challenges across different domains** +| | |-- adaptability/ +| | |-- basic_abilities/ +| | |-- code/ +| | |-- memory/ +| | |-- retrieval/ +| | |-- web_navigation/ +| | |-- writing/ +| |-- tests/ **challenges across different metrics** +| | |-- basic_abilities/ +| | |-- interface/ +| |-- workspace/ **workspace related func** +| | |-- **init**.py +| | |-- workspace_manager.py **creation, deletion** ``` -#### Validators - -Designed to handle specific types of output (e.g., text, code, structured data) +### Easy Challenge Creation -#### Logging - -Log different requests coming in - write file, change file, etc. Maybe a db in the future for metrics, logs, etc +tbd, but potentially shared Challenge class that challenges instantiate as challenges need different utils/metrics for eval #### Written Challenges For code, writing we can create a reference text and use metrics like METEOR, BERTScore, BARTScore -## Repo +#### Validators -``` -|-- agbenchmark/ **main project directory** -| |-- **init**.py -| |-- server/ -| | |-- **init**.py -| | |-- api.py **opens server on host and exposes urls** -| | |-- utils.py -| |-- benchmark/ -| | |-- **init**.py -| | |-- benchmark.py **combining scores, metrics, final evaluation** -| | |-- run.py **entry point. sets everything up** -| | |-- challenges/ **challenges across different metrics** -| | | |-- **init**.py -| | | |-- Challenge.py **easy challenge creation through Challenge class. potentially how DSL is defined. may need to inherit challenge class like Adaptability(Challenge)** -| | | |-- utils.py -| | | |-- adaptability.py -| | | |-- basic_abilities.py -| | | |-- code.py -| | | |-- memory.py -| | | |-- retrieval.py -| | | |-- web_navigation.py -| | | |-- writing.py -| |-- workspace/ **workspace related func** -| | |-- **init**.py -| | |-- workspace_manager.py **creation, deletion, preSignedUrl generation** -| | |-- cloud_services/ -| | | |-- **init**.py -| | | |-- aws.py **not finalized, but write, read, and del files** -|-- tests/ **test func of agbenchmark** -| |-- **init**.py -| |-- test_api.py -| |-- test_benchmark.py -| |-- test_workspace_manager.py -``` +Designed to handle specific types of output (e.g., text, code, structured data) + +#### Logging + +Log different requests coming in - write file, change file, etc. Maybe a db in the future for metrics, logs, etc Later: GitHub Actions integration, OpenAPI?, good versioning and backward compatibility diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py new file mode 100644 index 000000000..20bf55853 --- /dev/null +++ b/agbenchmark/Challenge.py @@ -0,0 +1,32 @@ +import os +from typing import Optional + + +class Challenge: + """The parent class to all specific challenges classes. 
+ Defines helper methods for running a challenge""" + + @staticmethod + def open_file(workspace: str, filename: str): + script_dir = os.path.abspath(workspace) + workspace_dir = os.path.join(script_dir, filename) + with open(workspace_dir, "r") as f: + return f.read() + + @staticmethod + def write_to_file(workspace: str, filename: str, content: str): + script_dir = os.path.abspath(workspace) + print("Writing file at", script_dir) + workspace_dir = os.path.join(script_dir, filename) + + # Open the file in write mode. + with open(workspace_dir, "w") as f: + # Write the content to the file. + f.write(content) + + def get_filenames_in_workspace(self, workspace: str): + return [ + filename + for filename in os.listdir(workspace) + if os.path.isfile(os.path.join(workspace, filename)) + ] diff --git a/agbenchmark/benchmark/__init__.py b/agbenchmark/benchmark/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/benchmark/benchmark.py b/agbenchmark/benchmark/benchmark.py deleted file mode 100644 index 2f8124272..000000000 --- a/agbenchmark/benchmark/benchmark.py +++ /dev/null @@ -1 +0,0 @@ -# how well the agent did on the challenges, the metrics calculation diff --git a/agbenchmark/benchmark/challenges/Challenge.py b/agbenchmark/benchmark/challenges/Challenge.py deleted file mode 100644 index bed522a85..000000000 --- a/agbenchmark/benchmark/challenges/Challenge.py +++ /dev/null @@ -1,11 +0,0 @@ -import json - -class Challenge(object): - def __init__(self, json_data): - self.json_data = json_data - - @classmethod - def from_json_file(cls, json_file): - with open(json_file) as f: - json_data = json.load(f) - return cls(json_data) \ No newline at end of file diff --git a/agbenchmark/benchmark/challenges/__init__.py b/agbenchmark/benchmark/challenges/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/benchmark/challenges/adaptability/a1_test.py b/agbenchmark/benchmark/challenges/adaptability/a1_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/benchmark/challenges/basic_abilities/browse_test.py b/agbenchmark/benchmark/challenges/basic_abilities/browse_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/benchmark/challenges/basic_abilities/read_file_test.py b/agbenchmark/benchmark/challenges/basic_abilities/read_file_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/benchmark/challenges/basic_abilities/remember_context_test.py b/agbenchmark/benchmark/challenges/basic_abilities/remember_context_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/benchmark/challenges/basic_abilities/write_file_test.py b/agbenchmark/benchmark/challenges/basic_abilities/write_file_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/benchmark/challenges/code/c1_test.py b/agbenchmark/benchmark/challenges/code/c1_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/benchmark/challenges/memory/m1_test.py b/agbenchmark/benchmark/challenges/memory/m1_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/benchmark/challenges/retrieval/r1_test.py b/agbenchmark/benchmark/challenges/retrieval/r1_test.py deleted file mode 100644 index f300d094c..000000000 --- a/agbenchmark/benchmark/challenges/retrieval/r1_test.py +++ /dev/null @@ -1,29 +0,0 @@ -from ..Challenge import Challenge - - -class RetrievelChallenge(Challenge): - """ 
Challenge for information-retrieval """ - def __init__(self, json_data): - self.json_data = json_data - assert self.json_data["category"] == "information-retrieval" - - @property - def agent_input(self): - return self.json_data["query"] - - def scoring(self, content): - for should_contain_word in self.json_data["ground"]["should_contain"]: - if should_contain_word not in content: - return 0. - - for should_not_contain_word in self.json_data["ground"]["should_not_contain"]: - if should_not_contain_word in content: - return 0. - return 1. - - def run(self, output_file): - output = open(output_file).read().strip() - - score = self.scoring(output) - - return score \ No newline at end of file diff --git a/agbenchmark/benchmark/challenges/utils.py b/agbenchmark/benchmark/challenges/utils.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/benchmark/challenges/web_navigation/wn1_test.py b/agbenchmark/benchmark/challenges/web_navigation/wn1_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/benchmark/challenges/writing/w1_test.py b/agbenchmark/benchmark/challenges/writing/w1_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/benchmark/run.py b/agbenchmark/benchmark/run.py deleted file mode 100644 index b07ac6b55..000000000 --- a/agbenchmark/benchmark/run.py +++ /dev/null @@ -1 +0,0 @@ -# running all of the different challenges diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md new file mode 100644 index 000000000..40281c99b --- /dev/null +++ b/agbenchmark/challenges/README.md @@ -0,0 +1,42 @@ +# Challenges Data Schema of Benchmark + +## General challenges + +Input: + +- **category** (str): information-retrieval +- **difficulty** (str): the difficulty of this query. choices from ["easy", "medium", "hard"] + +## Information-retrieval challenges + +Input: + +- **category** (str): information-retrieval +- **task** (str): the question the agent needs to solve. +- **ground** (dict): The ground truth. + - **answer** (str): The raw text of the ground truth answer + - **should_contain** (list): the exact strings that are required in the final answer + - **should_not_contain** (list): the exact strings that should not be in the final answer + - **files**: files that are used for retrieval +- **difficulty** (str): the difficulty of this query. choices from ["easy", "medium", "hard"]
+ +Example: + +```python +{ + "category": "retrieval", + "task": "What is the capital of America?", + "ground": { + "answer": "Washington", + "should_contain": ["Washington"], + "should_not_contain": ["New York", "Los Angeles", "San Francisco"], + "files": ["file_to_check.txt"] + }, + "difficulty": "easy" +} + +``` + +Output: + +- **score** (float): scores range from [0, 1] diff --git a/agbenchmark/challenges/__init__.py b/agbenchmark/challenges/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/challenges/adaptability/a1_test.py b/agbenchmark/challenges/adaptability/a1_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/challenges/code/c1_test.py b/agbenchmark/challenges/code/c1_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py new file mode 100644 index 000000000..94b5ba533 --- /dev/null +++ b/agbenchmark/challenges/define_task_types.py @@ -0,0 +1,29 @@ +from pydantic import BaseModel +from typing import List, Optional +import json +import os + + +class Ground(BaseModel): + answer: str + should_contain: Optional[List[str]] + should_not_contain: Optional[List[str]] + files: List[str] + + +class Challenge(BaseModel): + category: str + task: str + ground: Ground + difficulty: str + + def serialize(self, path: str) -> None: + with open(path, "w") as file: + file.write(self.json()) + + @staticmethod + def deserialize(path: str) -> "Challenge": + print("Deserializing", path) + with open(path, "r") as file: + data = json.load(file) + return Challenge(**data) diff --git a/agbenchmark/challenges/memory/m1_test.py b/agbenchmark/challenges/memory/m1_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/challenges/retrieval/Retrieval.py b/agbenchmark/challenges/retrieval/Retrieval.py new file mode 100644 index 000000000..2db22ae4d --- /dev/null +++ b/agbenchmark/challenges/retrieval/Retrieval.py @@ -0,0 +1,27 @@ +from agbenchmark.Challenge import Challenge +from agbenchmark.challenges.define_task_types import Ground + + +class RetrievalChallenge(Challenge): + """Challenge for information-retrieval""" + + def scoring(self, content: str, ground: Ground): + if ground.should_contain: + for should_contain_word in ground.should_contain: + if should_contain_word not in content: + return 0.0 + else: + print( + f"Word that should exist: {should_contain_word} exists in the content" + ) + + if ground.should_not_contain: + for should_not_contain_word in ground.should_not_contain: + if should_not_contain_word in content: + return 0.0 + else: + print( + f"Word that should not exist: {should_not_contain_word} does not exist in the content" + ) + + return 1.0 diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json new file mode 100644 index 000000000..b5d5701ea --- /dev/null +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -0,0 +1,11 @@ +{ + "category": "retrieval", + "task": "What is the capital of America?", + "ground": { + "answer": "Washington", + "should_contain": ["Washington"], + "should_not_contain": ["New York", "Los Angeles", "San Francisco"], + "files": ["file_to_check.txt"] + }, + "difficulty": "easy" +}
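For orientation, a rough sketch of how a challenge definition like `r1_data.json` above flows through the pydantic models in `define_task_types.py` (assumes the repo root as the working directory):

```python
# Load the challenge JSON into the typed Challenge/Ground models shown above.
from agbenchmark.challenges.define_task_types import Challenge

data = Challenge.deserialize("agbenchmark/challenges/retrieval/r1/r1_data.json")
print(data.task)                   # "What is the capital of America?"
print(data.ground.should_contain)  # ["Washington"]
```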
diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py new file mode 100644 index 000000000..195de15f8 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -0,0 +1,25 @@ +import pytest +from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge +from agbenchmark.challenges.define_task_types import Challenge, Ground +import os + +data = Challenge.deserialize(os.path.join(os.path.dirname(__file__), "r1_data.json")) + + +class TestRetrieval1(RetrievalChallenge): + """The first information-retrieval challenge""" + + @pytest.mark.parametrize( + "server_response", + [data.task], + indirect=True, + ) + @pytest.mark.retrieval + def test_retrieval(self, workspace): + file = self.open_file(workspace, data.ground.files[0]) + + score = self.scoring(file, data.ground) + + print("Your score is:", score) + + assert score diff --git a/agbenchmark/challenges/web_navigation/wn1_test.py b/agbenchmark/challenges/web_navigation/wn1_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/challenges/writing/w1_test.py b/agbenchmark/challenges/writing/w1_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/config.json b/agbenchmark/config.json new file mode 100644 index 000000000..d285627e5 --- /dev/null +++ b/agbenchmark/config.json @@ -0,0 +1,5 @@ +{ + "hostname": "localhost", + "port": 8080, + "workspace": "agbenchmark/mocks/workspace" +} diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py new file mode 100644 index 000000000..b3ca086d8 --- /dev/null +++ b/agbenchmark/conftest.py @@ -0,0 +1,43 @@ +import json +import os +import pytest +import shutil +from agbenchmark.mocks.tests.retrieval_manual import mock_retrieval +import requests + + +@pytest.fixture(scope="module") +def config(): + config_file = os.path.abspath("agbenchmark/config.json") + print(f"Config file: {config_file}") + with open(config_file, "r") as f: + config = json.load(f) + return config + + +@pytest.fixture +def workspace(config): + yield config["workspace"] + # teardown after test function completes + for filename in os.listdir(config["workspace"]): + file_path = os.path.join(config["workspace"], filename) + try: + if os.path.isfile(file_path) or os.path.islink(file_path): + os.unlink(file_path) + elif os.path.isdir(file_path): + shutil.rmtree(file_path) + except Exception as e: + print(f"Failed to delete {file_path}. Reason: {e}") + + +@pytest.fixture(autouse=True) +def server_response(request, config): + task = request.param # The task is passed in indirectly + print(f"Server starting at {request.module}") + # response = requests.post( + # f"{config['hostname']}:{config['port']}", data={"task": task} + # ) + # assert ( + # response.status_code == 200 + # ), f"Request failed with status code {response.status_code}" + mock_retrieval(task, config["workspace"])
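Taken together, these fixtures mean a new challenge test only has to parametrize `server_response` with its task string and then inspect the workspace. A minimal sketch of such a test (hypothetical, mirroring the pattern in `r1_test.py` above; the mark name `basic` and the task string are illustrative, not part of the commit):

```python
import os

import pytest


class TestWriteFile:
    @pytest.mark.parametrize(
        "server_response",
        ["Write 'hello' into file_to_check.txt"],  # task handed to the agent
        indirect=True,
    )
    @pytest.mark.basic
    def test_write_file(self, workspace):
        # The autouse server_response fixture has already run the mock agent,
        # so the test only checks what landed in the shared workspace.
        assert os.path.exists(os.path.join(workspace, "file_to_check.txt"))
```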
Reason: {e}") + + +@pytest.fixture(autouse=True) +def server_response(request, config): + task = request.param # The task is passed in indirectly + print(f"Server starting at {request.module}") + # response = requests.post( + # f"{config['hostname']}:{config['port']}", data={"task": task} + # ) + # assert ( + # response.status_code == 200 + # ), f"Request failed with status code {response.status_code}" + mock_retrieval(task, config["workspace"]) diff --git a/agbenchmark/metrics.py b/agbenchmark/metrics.py new file mode 100644 index 000000000..bf72570a7 --- /dev/null +++ b/agbenchmark/metrics.py @@ -0,0 +1,10 @@ +# how well the agent did on the challenges, the metrics calculation for the future if we're tracking specific tests + +# POTENTIAL METRICS +# pass/fail - in the future could have a % metric of challenge completed, milestones achieved +# convergence - how long it took to get the result +# difficulty of the task - defined by previous comparing to runs against other agents +# consistency +# time passed +# budget used +# divergence (distractions not related to task at hand) diff --git a/agbenchmark/mocks/basic_gpt_agent.py b/agbenchmark/mocks/basic_gpt_agent.py new file mode 100644 index 000000000..6aac3d191 --- /dev/null +++ b/agbenchmark/mocks/basic_gpt_agent.py @@ -0,0 +1,20 @@ +import json +import openai + + +def basic_gpt_agent(query) -> str: + response = openai.ChatCompletion.create( + model="gpt-3.5-turbo-0613", messages=[{"role": "user", "content": query}] + ) + + answer = response["choices"][0]["message"]["content"] # type: ignore + + print("QUERY : ", query) + print("AGENT ANSWER: ", answer) + + return answer + + +if __name__ == "__main__": + # server boilerplate example here + basic_gpt_agent("") diff --git a/agbenchmark/mocks/tests/retrieval_manual.py b/agbenchmark/mocks/tests/retrieval_manual.py new file mode 100644 index 000000000..ccb482132 --- /dev/null +++ b/agbenchmark/mocks/tests/retrieval_manual.py @@ -0,0 +1,10 @@ +from ..basic_gpt_agent import basic_gpt_agent +from agbenchmark.Challenge import Challenge + + +def mock_retrieval(task: str, workspace: str): + # Call the basic_gpt_agent to get a response. + response = basic_gpt_agent(task) + + # Open the file in write mode. + Challenge.write_to_file(workspace, "file_to_check.txt", response) diff --git a/agbenchmark/server/__init__.py b/agbenchmark/server/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/server/api.py b/agbenchmark/server/api.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/server/utils.py b/agbenchmark/server/utils.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py new file mode 100644 index 000000000..79f308435 --- /dev/null +++ b/agbenchmark/start_benchmark.py @@ -0,0 +1,48 @@ +import click +import pytest +import json +import os + + +@click.group() +def cli(): + pass + + +@cli.command() +@click.option("--challenge", default=None, help="Specific challenge to run") +def start(challenge): + """Start the benchmark tests. 
diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py new file mode 100644 index 000000000..79f308435 --- /dev/null +++ b/agbenchmark/start_benchmark.py @@ -0,0 +1,48 @@ +import click +import pytest +import json +import os + + +@click.group() +def cli(): + pass + + +@cli.command() +@click.option("--challenge", default=None, help="Specific challenge to run") +def start(challenge): + """Start the benchmark tests. If a challenge flag is provided, run the challenges with that mark.""" + with open("agbenchmark/config.json", "r") as f: + config = json.load(f) + + print("Current configuration:") + for key, value in config.items(): + print(f"{key}: {value}") + + update_config = click.confirm( + "\nDo you want to update these parameters?", default=False + ) + if update_config: + config["hostname"] = click.prompt( + "\nPlease enter a new hostname", default=config["hostname"] + ) + config["port"] = click.prompt("Please enter a new port", default=config["port"]) + config["workspace"] = click.prompt( + "Please enter a new workspace path", default=config["workspace"] + ) + + with open("agbenchmark/config.json", "w") as f: + json.dump(config, f) + + print("Starting benchmark tests...", challenge) + if challenge: + print(f"Running {challenge} challenges") + pytest.main(["agbenchmark", "-m", challenge, "-vs"]) + else: + print("Running all challenges") + pytest.main(["agbenchmark", "-vs"]) + + +if __name__ == "__main__": + start() diff --git a/agbenchmark/tests/basic_abilities/browse_test.py b/agbenchmark/tests/basic_abilities/browse_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/tests/basic_abilities/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/tests/basic_abilities/remember_context_test.py b/agbenchmark/tests/basic_abilities/remember_context_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/tests/basic_abilities/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py new file mode 100644 index 000000000..b05a7ac31 --- /dev/null +++ b/agbenchmark/utils.py @@ -0,0 +1 @@ +# radio charts, logs, helper functions for tests, anything else relevant. diff --git a/agbenchmark/workspace/__init__.py b/agbenchmark/workspace/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/workspace/cloud_services/aws.py b/agbenchmark/workspace/cloud_services/aws.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/workspace/workspace_manager.py b/agbenchmark/workspace/workspace_manager.py deleted file mode 100644 index cfcf3f7ac..000000000 --- a/agbenchmark/workspace/workspace_manager.py +++ /dev/null @@ -1 +0,0 @@ -# Manages the workspaces including creation, deletion, etc diff --git a/data/README.md b/data/README.md deleted file mode 100644 index d3e32b563..000000000 --- a/data/README.md +++ /dev/null @@ -1,37 +0,0 @@ -# Challenges Data Schema of Benchmark - -## General challenges -Input: -- **category** (str): information-retrieval -- **difficulty_level** (str): the difficulty of this query. choices from ["easy", "medium", "hard"] - - - -## Information-retrieval challenges -Input: -- **category** (str): information-retrieval -- **query** (str): the question that needs to be solved. -- **ground** (dict): The ground truth. - - **answer** (str): The raw text of the ground truth answer - - **should_contain** (list): the exact strings that are required in the final answer - - **should_not_contain** (list): the exact strings that should not be in the final answer -- **difficulty_level** (str): the difficulty of this query. 
choices from ["easy", "medium", "hard"] - -Example: -```python -{ - "category": "information-retrieval", - "query": "what is the capital of America", - "ground": { - "answer": "Washington", - "should_contain": ["Washington"], - "should_not_contain": ["New York", "Los Angeles", "San Francisco"] - }, - "difficulty_level": "easy" -} -``` - - - -Output: -- **score** (float): scores range from [0, 1] \ No newline at end of file diff --git a/data/retrieval/r1_test_data_0.json b/data/retrieval/r1_test_data_0.json deleted file mode 100644 index a64f7e0cc..000000000 --- a/data/retrieval/r1_test_data_0.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "category": "information-retrieval", - "query": "what is the capital of America", - "ground": { - "answer": "Washington", - "should_contain": ["Washington"], - "should_not_contain": ["New York", "Los Angeles", "San Francisco"] - }, - "difficulty_level": "easy" -} \ No newline at end of file diff --git a/data/retrieval/r1_test_data_1.json b/data/retrieval/r1_test_data_1.json deleted file mode 100644 index 73dec4cdd..000000000 --- a/data/retrieval/r1_test_data_1.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "category": "information-retrieval", - "query": "The Nobel Prize in Literature 2012", - "ground": { - "answer": "Mo Yan", - "should_contain": ["Mo Yan"], - "should_not_contain": ["Murakami Haruki"] - }, - "difficulty_level": "easy" -} \ No newline at end of file diff --git a/examples/basic_gpt_agent.py b/examples/basic_gpt_agent.py deleted file mode 100644 index e2cc380c8..000000000 --- a/examples/basic_gpt_agent.py +++ /dev/null @@ -1,26 +0,0 @@ -import json -import openai -from agbenchmark.benchmark.challenges.retrieval.r1_test import RetrievelChallenge - - -def basic_gpt_agent(challenge_file): - challenge = RetrievelChallenge.from_json_file(challenge_file) - - response = openai.ChatCompletion.create( - model="gpt-3.5-turbo-0613", - messages=[{"role": "user", "content": challenge.agent_input}]) - answer = response["choices"][0]["message"]["content"] - - output_file = "./basic_gpt_agent_retrieval_results.txt" - with open(output_file, "w") as f: - f.write(answer) - - print("QUERY : ", challenge.agent_input) - print("AGENT ANSWER: ", answer) - - score = challenge.run(output_file) - - print("AGENT SCORE : ", score) - -if __name__ == "__main__": - basic_gpt_agent("./data/retrieval/r1_test_data_1.json") diff --git a/file_to_check.txt b/file_to_check.txt new file mode 100644 index 000000000..29afa8611 --- /dev/null +++ b/file_to_check.txt @@ -0,0 +1 @@ +The capital of America is Washington, D.C. \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index 12a0390ef..3f1059aaf 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,5 +1,265 @@ # This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. 
+[[package]] +name = "aiohttp" +version = "3.8.4" +description = "Async http client/server framework (asyncio)" +optional = false +python-versions = ">=3.6" +files = [ + {file = "aiohttp-3.8.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5ce45967538fb747370308d3145aa68a074bdecb4f3a300869590f725ced69c1"}, + {file = "aiohttp-3.8.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b744c33b6f14ca26b7544e8d8aadff6b765a80ad6164fb1a430bbadd593dfb1a"}, + {file = "aiohttp-3.8.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1a45865451439eb320784918617ba54b7a377e3501fb70402ab84d38c2cd891b"}, + {file = "aiohttp-3.8.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a86d42d7cba1cec432d47ab13b6637bee393a10f664c425ea7b305d1301ca1a3"}, + {file = "aiohttp-3.8.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ee3c36df21b5714d49fc4580247947aa64bcbe2939d1b77b4c8dcb8f6c9faecc"}, + {file = "aiohttp-3.8.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:176a64b24c0935869d5bbc4c96e82f89f643bcdf08ec947701b9dbb3c956b7dd"}, + {file = "aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c844fd628851c0bc309f3c801b3a3d58ce430b2ce5b359cd918a5a76d0b20cb5"}, + {file = "aiohttp-3.8.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5393fb786a9e23e4799fec788e7e735de18052f83682ce2dfcabaf1c00c2c08e"}, + {file = "aiohttp-3.8.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e4b09863aae0dc965c3ef36500d891a3ff495a2ea9ae9171e4519963c12ceefd"}, + {file = "aiohttp-3.8.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:adfbc22e87365a6e564c804c58fc44ff7727deea782d175c33602737b7feadb6"}, + {file = "aiohttp-3.8.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:147ae376f14b55f4f3c2b118b95be50a369b89b38a971e80a17c3fd623f280c9"}, + {file = "aiohttp-3.8.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:eafb3e874816ebe2a92f5e155f17260034c8c341dad1df25672fb710627c6949"}, + {file = "aiohttp-3.8.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c6cc15d58053c76eacac5fa9152d7d84b8d67b3fde92709195cb984cfb3475ea"}, + {file = "aiohttp-3.8.4-cp310-cp310-win32.whl", hash = "sha256:59f029a5f6e2d679296db7bee982bb3d20c088e52a2977e3175faf31d6fb75d1"}, + {file = "aiohttp-3.8.4-cp310-cp310-win_amd64.whl", hash = "sha256:fe7ba4a51f33ab275515f66b0a236bcde4fb5561498fe8f898d4e549b2e4509f"}, + {file = "aiohttp-3.8.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:3d8ef1a630519a26d6760bc695842579cb09e373c5f227a21b67dc3eb16cfea4"}, + {file = "aiohttp-3.8.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5b3f2e06a512e94722886c0827bee9807c86a9f698fac6b3aee841fab49bbfb4"}, + {file = "aiohttp-3.8.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3a80464982d41b1fbfe3154e440ba4904b71c1a53e9cd584098cd41efdb188ef"}, + {file = "aiohttp-3.8.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b631e26df63e52f7cce0cce6507b7a7f1bc9b0c501fcde69742130b32e8782f"}, + {file = "aiohttp-3.8.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f43255086fe25e36fd5ed8f2ee47477408a73ef00e804cb2b5cba4bf2ac7f5e"}, + {file = "aiohttp-3.8.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4d347a172f866cd1d93126d9b239fcbe682acb39b48ee0873c73c933dd23bd0f"}, + {file = "aiohttp-3.8.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:a3fec6a4cb5551721cdd70473eb009d90935b4063acc5f40905d40ecfea23e05"}, + {file = "aiohttp-3.8.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:80a37fe8f7c1e6ce8f2d9c411676e4bc633a8462844e38f46156d07a7d401654"}, + {file = "aiohttp-3.8.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d1e6a862b76f34395a985b3cd39a0d949ca80a70b6ebdea37d3ab39ceea6698a"}, + {file = "aiohttp-3.8.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:cd468460eefef601ece4428d3cf4562459157c0f6523db89365202c31b6daebb"}, + {file = "aiohttp-3.8.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:618c901dd3aad4ace71dfa0f5e82e88b46ef57e3239fc7027773cb6d4ed53531"}, + {file = "aiohttp-3.8.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:652b1bff4f15f6287550b4670546a2947f2a4575b6c6dff7760eafb22eacbf0b"}, + {file = "aiohttp-3.8.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:80575ba9377c5171407a06d0196b2310b679dc752d02a1fcaa2bc20b235dbf24"}, + {file = "aiohttp-3.8.4-cp311-cp311-win32.whl", hash = "sha256:bbcf1a76cf6f6dacf2c7f4d2ebd411438c275faa1dc0c68e46eb84eebd05dd7d"}, + {file = "aiohttp-3.8.4-cp311-cp311-win_amd64.whl", hash = "sha256:6e74dd54f7239fcffe07913ff8b964e28b712f09846e20de78676ce2a3dc0bfc"}, + {file = "aiohttp-3.8.4-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:880e15bb6dad90549b43f796b391cfffd7af373f4646784795e20d92606b7a51"}, + {file = "aiohttp-3.8.4-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb96fa6b56bb536c42d6a4a87dfca570ff8e52de2d63cabebfd6fb67049c34b6"}, + {file = "aiohttp-3.8.4-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4a6cadebe132e90cefa77e45f2d2f1a4b2ce5c6b1bfc1656c1ddafcfe4ba8131"}, + {file = "aiohttp-3.8.4-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f352b62b45dff37b55ddd7b9c0c8672c4dd2eb9c0f9c11d395075a84e2c40f75"}, + {file = "aiohttp-3.8.4-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ab43061a0c81198d88f39aaf90dae9a7744620978f7ef3e3708339b8ed2ef01"}, + {file = "aiohttp-3.8.4-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c9cb1565a7ad52e096a6988e2ee0397f72fe056dadf75d17fa6b5aebaea05622"}, + {file = "aiohttp-3.8.4-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:1b3ea7edd2d24538959c1c1abf97c744d879d4e541d38305f9bd7d9b10c9ec41"}, + {file = "aiohttp-3.8.4-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:7c7837fe8037e96b6dd5cfcf47263c1620a9d332a87ec06a6ca4564e56bd0f36"}, + {file = "aiohttp-3.8.4-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:3b90467ebc3d9fa5b0f9b6489dfb2c304a1db7b9946fa92aa76a831b9d587e99"}, + {file = "aiohttp-3.8.4-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:cab9401de3ea52b4b4c6971db5fb5c999bd4260898af972bf23de1c6b5dd9d71"}, + {file = "aiohttp-3.8.4-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:d1f9282c5f2b5e241034a009779e7b2a1aa045f667ff521e7948ea9b56e0c5ff"}, + {file = "aiohttp-3.8.4-cp36-cp36m-win32.whl", hash = "sha256:5e14f25765a578a0a634d5f0cd1e2c3f53964553a00347998dfdf96b8137f777"}, + {file = "aiohttp-3.8.4-cp36-cp36m-win_amd64.whl", hash = "sha256:4c745b109057e7e5f1848c689ee4fb3a016c8d4d92da52b312f8a509f83aa05e"}, + {file = "aiohttp-3.8.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:aede4df4eeb926c8fa70de46c340a1bc2c6079e1c40ccf7b0eae1313ffd33519"}, + {file = "aiohttp-3.8.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:4ddaae3f3d32fc2cb4c53fab020b69a05c8ab1f02e0e59665c6f7a0d3a5be54f"}, + {file = "aiohttp-3.8.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4eb3b82ca349cf6fadcdc7abcc8b3a50ab74a62e9113ab7a8ebc268aad35bb9"}, + {file = "aiohttp-3.8.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9bcb89336efa095ea21b30f9e686763f2be4478f1b0a616969551982c4ee4c3b"}, + {file = "aiohttp-3.8.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c08e8ed6fa3d477e501ec9db169bfac8140e830aa372d77e4a43084d8dd91ab"}, + {file = "aiohttp-3.8.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c6cd05ea06daca6ad6a4ca3ba7fe7dc5b5de063ff4daec6170ec0f9979f6c332"}, + {file = "aiohttp-3.8.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:b7a00a9ed8d6e725b55ef98b1b35c88013245f35f68b1b12c5cd4100dddac333"}, + {file = "aiohttp-3.8.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:de04b491d0e5007ee1b63a309956eaed959a49f5bb4e84b26c8f5d49de140fa9"}, + {file = "aiohttp-3.8.4-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:40653609b3bf50611356e6b6554e3a331f6879fa7116f3959b20e3528783e699"}, + {file = "aiohttp-3.8.4-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:dbf3a08a06b3f433013c143ebd72c15cac33d2914b8ea4bea7ac2c23578815d6"}, + {file = "aiohttp-3.8.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:854f422ac44af92bfe172d8e73229c270dc09b96535e8a548f99c84f82dde241"}, + {file = "aiohttp-3.8.4-cp37-cp37m-win32.whl", hash = "sha256:aeb29c84bb53a84b1a81c6c09d24cf33bb8432cc5c39979021cc0f98c1292a1a"}, + {file = "aiohttp-3.8.4-cp37-cp37m-win_amd64.whl", hash = "sha256:db3fc6120bce9f446d13b1b834ea5b15341ca9ff3f335e4a951a6ead31105480"}, + {file = "aiohttp-3.8.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:fabb87dd8850ef0f7fe2b366d44b77d7e6fa2ea87861ab3844da99291e81e60f"}, + {file = "aiohttp-3.8.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:91f6d540163f90bbaef9387e65f18f73ffd7c79f5225ac3d3f61df7b0d01ad15"}, + {file = "aiohttp-3.8.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:d265f09a75a79a788237d7f9054f929ced2e69eb0bb79de3798c468d8a90f945"}, + {file = "aiohttp-3.8.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3d89efa095ca7d442a6d0cbc755f9e08190ba40069b235c9886a8763b03785da"}, + {file = "aiohttp-3.8.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4dac314662f4e2aa5009977b652d9b8db7121b46c38f2073bfeed9f4049732cd"}, + {file = "aiohttp-3.8.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fe11310ae1e4cd560035598c3f29d86cef39a83d244c7466f95c27ae04850f10"}, + {file = "aiohttp-3.8.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ddb2a2026c3f6a68c3998a6c47ab6795e4127315d2e35a09997da21865757f8"}, + {file = "aiohttp-3.8.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e75b89ac3bd27d2d043b234aa7b734c38ba1b0e43f07787130a0ecac1e12228a"}, + {file = "aiohttp-3.8.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6e601588f2b502c93c30cd5a45bfc665faaf37bbe835b7cfd461753068232074"}, + {file = "aiohttp-3.8.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a5d794d1ae64e7753e405ba58e08fcfa73e3fad93ef9b7e31112ef3c9a0efb52"}, + {file = "aiohttp-3.8.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:a1f4689c9a1462f3df0a1f7e797791cd6b124ddbee2b570d34e7f38ade0e2c71"}, + {file = 
"aiohttp-3.8.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:3032dcb1c35bc330134a5b8a5d4f68c1a87252dfc6e1262c65a7e30e62298275"}, + {file = "aiohttp-3.8.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8189c56eb0ddbb95bfadb8f60ea1b22fcfa659396ea36f6adcc521213cd7b44d"}, + {file = "aiohttp-3.8.4-cp38-cp38-win32.whl", hash = "sha256:33587f26dcee66efb2fff3c177547bd0449ab7edf1b73a7f5dea1e38609a0c54"}, + {file = "aiohttp-3.8.4-cp38-cp38-win_amd64.whl", hash = "sha256:e595432ac259af2d4630008bf638873d69346372d38255774c0e286951e8b79f"}, + {file = "aiohttp-3.8.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5a7bdf9e57126dc345b683c3632e8ba317c31d2a41acd5800c10640387d193ed"}, + {file = "aiohttp-3.8.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:22f6eab15b6db242499a16de87939a342f5a950ad0abaf1532038e2ce7d31567"}, + {file = "aiohttp-3.8.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7235604476a76ef249bd64cb8274ed24ccf6995c4a8b51a237005ee7a57e8643"}, + {file = "aiohttp-3.8.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea9eb976ffdd79d0e893869cfe179a8f60f152d42cb64622fca418cd9b18dc2a"}, + {file = "aiohttp-3.8.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:92c0cea74a2a81c4c76b62ea1cac163ecb20fb3ba3a75c909b9fa71b4ad493cf"}, + {file = "aiohttp-3.8.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:493f5bc2f8307286b7799c6d899d388bbaa7dfa6c4caf4f97ef7521b9cb13719"}, + {file = "aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a63f03189a6fa7c900226e3ef5ba4d3bd047e18f445e69adbd65af433add5a2"}, + {file = "aiohttp-3.8.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:10c8cefcff98fd9168cdd86c4da8b84baaa90bf2da2269c6161984e6737bf23e"}, + {file = "aiohttp-3.8.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:bca5f24726e2919de94f047739d0a4fc01372801a3672708260546aa2601bf57"}, + {file = "aiohttp-3.8.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:03baa76b730e4e15a45f81dfe29a8d910314143414e528737f8589ec60cf7391"}, + {file = "aiohttp-3.8.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:8c29c77cc57e40f84acef9bfb904373a4e89a4e8b74e71aa8075c021ec9078c2"}, + {file = "aiohttp-3.8.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:03543dcf98a6619254b409be2d22b51f21ec66272be4ebda7b04e6412e4b2e14"}, + {file = "aiohttp-3.8.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:17b79c2963db82086229012cff93ea55196ed31f6493bb1ccd2c62f1724324e4"}, + {file = "aiohttp-3.8.4-cp39-cp39-win32.whl", hash = "sha256:34ce9f93a4a68d1272d26030655dd1b58ff727b3ed2a33d80ec433561b03d67a"}, + {file = "aiohttp-3.8.4-cp39-cp39-win_amd64.whl", hash = "sha256:41a86a69bb63bb2fc3dc9ad5ea9f10f1c9c8e282b471931be0268ddd09430b04"}, + {file = "aiohttp-3.8.4.tar.gz", hash = "sha256:bf2e1a9162c1e441bf805a1fd166e249d574ca04e03b34f97e2928769e91ab5c"}, +] + +[package.dependencies] +aiosignal = ">=1.1.2" +async-timeout = ">=4.0.0a3,<5.0" +attrs = ">=17.3.0" +charset-normalizer = ">=2.0,<4.0" +frozenlist = ">=1.1.1" +multidict = ">=4.5,<7.0" +yarl = ">=1.0,<2.0" + +[package.extras] +speedups = ["Brotli", "aiodns", "cchardet"] + +[[package]] +name = "aiosignal" +version = "1.3.1" +description = "aiosignal: a list of registered asynchronous callbacks" +optional = false +python-versions = ">=3.7" +files = [ + {file = "aiosignal-1.3.1-py3-none-any.whl", hash = "sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17"}, + {file = 
"aiosignal-1.3.1.tar.gz", hash = "sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc"}, +] + +[package.dependencies] +frozenlist = ">=1.1.0" + +[[package]] +name = "async-timeout" +version = "4.0.2" +description = "Timeout context manager for asyncio programs" +optional = false +python-versions = ">=3.6" +files = [ + {file = "async-timeout-4.0.2.tar.gz", hash = "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15"}, + {file = "async_timeout-4.0.2-py3-none-any.whl", hash = "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"}, +] + +[[package]] +name = "attrs" +version = "23.1.0" +description = "Classes Without Boilerplate" +optional = false +python-versions = ">=3.7" +files = [ + {file = "attrs-23.1.0-py3-none-any.whl", hash = "sha256:1f28b4522cdc2fb4256ac1a020c78acf9cba2c6b461ccd2c126f3aa8e8335d04"}, + {file = "attrs-23.1.0.tar.gz", hash = "sha256:6279836d581513a26f1bf235f9acd333bc9115683f14f7e8fae46c98fc50e015"}, +] + +[package.extras] +cov = ["attrs[tests]", "coverage[toml] (>=5.3)"] +dev = ["attrs[docs,tests]", "pre-commit"] +docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier", "zope-interface"] +tests = ["attrs[tests-no-zope]", "zope-interface"] +tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] + +[[package]] +name = "certifi" +version = "2023.5.7" +description = "Python package for providing Mozilla's CA Bundle." +optional = false +python-versions = ">=3.6" +files = [ + {file = "certifi-2023.5.7-py3-none-any.whl", hash = "sha256:c6c2e98f5c7869efca1f8916fed228dd91539f9f1b444c314c06eef02980c716"}, + {file = "certifi-2023.5.7.tar.gz", hash = "sha256:0f0d56dc5a6ad56fd4ba36484d6cc34451e1c6548c61daad8c320169f91eddc7"}, +] + +[[package]] +name = "charset-normalizer" +version = "3.1.0" +description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
+optional = false +python-versions = ">=3.7.0" +files = [ + {file = "charset-normalizer-3.1.0.tar.gz", hash = "sha256:34e0a2f9c370eb95597aae63bf85eb5e96826d81e3dcf88b8886012906f509b5"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e0ac8959c929593fee38da1c2b64ee9778733cdf03c482c9ff1d508b6b593b2b"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d7fc3fca01da18fbabe4625d64bb612b533533ed10045a2ac3dd194bfa656b60"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:04eefcee095f58eaabe6dc3cc2262f3bcd776d2c67005880894f447b3f2cb9c1"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20064ead0717cf9a73a6d1e779b23d149b53daf971169289ed2ed43a71e8d3b0"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1435ae15108b1cb6fffbcea2af3d468683b7afed0169ad718451f8db5d1aff6f"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c84132a54c750fda57729d1e2599bb598f5fa0344085dbde5003ba429a4798c0"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75f2568b4189dda1c567339b48cba4ac7384accb9c2a7ed655cd86b04055c795"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:11d3bcb7be35e7b1bba2c23beedac81ee893ac9871d0ba79effc7fc01167db6c"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:891cf9b48776b5c61c700b55a598621fdb7b1e301a550365571e9624f270c203"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:5f008525e02908b20e04707a4f704cd286d94718f48bb33edddc7d7b584dddc1"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:b06f0d3bf045158d2fb8837c5785fe9ff9b8c93358be64461a1089f5da983137"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:49919f8400b5e49e961f320c735388ee686a62327e773fa5b3ce6721f7e785ce"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:22908891a380d50738e1f978667536f6c6b526a2064156203d418f4856d6e86a"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-win32.whl", hash = "sha256:12d1a39aa6b8c6f6248bb54550efcc1c38ce0d8096a146638fd4738e42284448"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:65ed923f84a6844de5fd29726b888e58c62820e0769b76565480e1fdc3d062f8"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9a3267620866c9d17b959a84dd0bd2d45719b817245e49371ead79ed4f710d19"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6734e606355834f13445b6adc38b53c0fd45f1a56a9ba06c2058f86893ae8017"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f8303414c7b03f794347ad062c0516cee0e15f7a612abd0ce1e25caf6ceb47df"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aaf53a6cebad0eae578f062c7d462155eada9c172bd8c4d250b8c1d8eb7f916a"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3dc5b6a8ecfdc5748a7e429782598e4f17ef378e3e272eeb1340ea57c9109f41"}, + {file = 
"charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e1b25e3ad6c909f398df8921780d6a3d120d8c09466720226fc621605b6f92b1"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ca564606d2caafb0abe6d1b5311c2649e8071eb241b2d64e75a0d0065107e62"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b82fab78e0b1329e183a65260581de4375f619167478dddab510c6c6fb04d9b6"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:bd7163182133c0c7701b25e604cf1611c0d87712e56e88e7ee5d72deab3e76b5"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:11d117e6c63e8f495412d37e7dc2e2fff09c34b2d09dbe2bee3c6229577818be"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:cf6511efa4801b9b38dc5546d7547d5b5c6ef4b081c60b23e4d941d0eba9cbeb"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:abc1185d79f47c0a7aaf7e2412a0eb2c03b724581139193d2d82b3ad8cbb00ac"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cb7b2ab0188829593b9de646545175547a70d9a6e2b63bf2cd87a0a391599324"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-win32.whl", hash = "sha256:c36bcbc0d5174a80d6cccf43a0ecaca44e81d25be4b7f90f0ed7bcfbb5a00909"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:cca4def576f47a09a943666b8f829606bcb17e2bc2d5911a46c8f8da45f56755"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:0c95f12b74681e9ae127728f7e5409cbbef9cd914d5896ef238cc779b8152373"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fca62a8301b605b954ad2e9c3666f9d97f63872aa4efcae5492baca2056b74ab"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ac0aa6cd53ab9a31d397f8303f92c42f534693528fafbdb997c82bae6e477ad9"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c3af8e0f07399d3176b179f2e2634c3ce9c1301379a6b8c9c9aeecd481da494f"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a5fc78f9e3f501a1614a98f7c54d3969f3ad9bba8ba3d9b438c3bc5d047dd28"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:628c985afb2c7d27a4800bfb609e03985aaecb42f955049957814e0491d4006d"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:74db0052d985cf37fa111828d0dd230776ac99c740e1a758ad99094be4f1803d"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:1e8fcdd8f672a1c4fc8d0bd3a2b576b152d2a349782d1eb0f6b8e52e9954731d"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:04afa6387e2b282cf78ff3dbce20f0cc071c12dc8f685bd40960cc68644cfea6"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:dd5653e67b149503c68c4018bf07e42eeed6b4e956b24c00ccdf93ac79cdff84"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d2686f91611f9e17f4548dbf050e75b079bbc2a82be565832bc8ea9047b61c8c"}, + {file = 
"charset_normalizer-3.1.0-cp37-cp37m-win32.whl", hash = "sha256:4155b51ae05ed47199dc5b2a4e62abccb274cee6b01da5b895099b61b1982974"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:322102cdf1ab682ecc7d9b1c5eed4ec59657a65e1c146a0da342b78f4112db23"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:e633940f28c1e913615fd624fcdd72fdba807bf53ea6925d6a588e84e1151531"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:3a06f32c9634a8705f4ca9946d667609f52cf130d5548881401f1eb2c39b1e2c"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7381c66e0561c5757ffe616af869b916c8b4e42b367ab29fedc98481d1e74e14"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3573d376454d956553c356df45bb824262c397c6e26ce43e8203c4c540ee0acb"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e89df2958e5159b811af9ff0f92614dabf4ff617c03a4c1c6ff53bf1c399e0e1"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:78cacd03e79d009d95635e7d6ff12c21eb89b894c354bd2b2ed0b4763373693b"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de5695a6f1d8340b12a5d6d4484290ee74d61e467c39ff03b39e30df62cf83a0"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1c60b9c202d00052183c9be85e5eaf18a4ada0a47d188a83c8f5c5b23252f649"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:f645caaf0008bacf349875a974220f1f1da349c5dbe7c4ec93048cdc785a3326"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ea9f9c6034ea2d93d9147818f17c2a0860d41b71c38b9ce4d55f21b6f9165a11"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:80d1543d58bd3d6c271b66abf454d437a438dff01c3e62fdbcd68f2a11310d4b"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:73dc03a6a7e30b7edc5b01b601e53e7fc924b04e1835e8e407c12c037e81adbd"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6f5c2e7bc8a4bf7c426599765b1bd33217ec84023033672c1e9a8b35eaeaaaf8"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-win32.whl", hash = "sha256:12a2b561af122e3d94cdb97fe6fb2bb2b82cef0cdca131646fdb940a1eda04f0"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:3160a0fd9754aab7d47f95a6b63ab355388d890163eb03b2d2b87ab0a30cfa59"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:38e812a197bf8e71a59fe55b757a84c1f946d0ac114acafaafaf21667a7e169e"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6baf0baf0d5d265fa7944feb9f7451cc316bfe30e8df1a61b1bb08577c554f31"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8f25e17ab3039b05f762b0a55ae0b3632b2e073d9c8fc88e89aca31a6198e88f"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3747443b6a904001473370d7810aa19c3a180ccd52a7157aacc264a5ac79265e"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:b116502087ce8a6b7a5f1814568ccbd0e9f6cfd99948aa59b0e241dc57cf739f"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d16fd5252f883eb074ca55cb622bc0bee49b979ae4e8639fff6ca3ff44f9f854"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21fa558996782fc226b529fdd2ed7866c2c6ec91cee82735c98a197fae39f706"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6f6c7a8a57e9405cad7485f4c9d3172ae486cfef1344b5ddd8e5239582d7355e"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:ac3775e3311661d4adace3697a52ac0bab17edd166087d493b52d4f4f553f9f0"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:10c93628d7497c81686e8e5e557aafa78f230cd9e77dd0c40032ef90c18f2230"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:6f4f4668e1831850ebcc2fd0b1cd11721947b6dc7c00bf1c6bd3c929ae14f2c7"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:0be65ccf618c1e7ac9b849c315cc2e8a8751d9cfdaa43027d4f6624bd587ab7e"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:53d0a3fa5f8af98a1e261de6a3943ca631c526635eb5817a87a59d9a57ebf48f"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-win32.whl", hash = "sha256:a04f86f41a8916fe45ac5024ec477f41f886b3c435da2d4e3d2709b22ab02af1"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:830d2948a5ec37c386d3170c483063798d7879037492540f10a475e3fd6f244b"}, + {file = "charset_normalizer-3.1.0-py3-none-any.whl", hash = "sha256:3d9098b479e78c85080c98e1e35ff40b4a31d8953102bb0fd7d1b6f8a2111a3d"}, +] + +[[package]] +name = "click" +version = "8.1.3" +description = "Composable command line interface toolkit" +optional = false +python-versions = ">=3.7" +files = [ + {file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"}, + {file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + [[package]] name = "colorama" version = "0.4.6" @@ -25,6 +285,100 @@ files = [ [package.extras] test = ["pytest (>=6)"] +[[package]] +name = "frozenlist" +version = "1.3.3" +description = "A list-like structure which implements collections.abc.MutableSequence" +optional = false +python-versions = ">=3.7" +files = [ + {file = "frozenlist-1.3.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ff8bf625fe85e119553b5383ba0fb6aa3d0ec2ae980295aaefa552374926b3f4"}, + {file = "frozenlist-1.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:dfbac4c2dfcc082fcf8d942d1e49b6aa0766c19d3358bd86e2000bf0fa4a9cf0"}, + {file = "frozenlist-1.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b1c63e8d377d039ac769cd0926558bb7068a1f7abb0f003e3717ee003ad85530"}, + {file = "frozenlist-1.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7fdfc24dcfce5b48109867c13b4cb15e4660e7bd7661741a391f821f23dfdca7"}, + {file = "frozenlist-1.3.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2c926450857408e42f0bbc295e84395722ce74bae69a3b2aa2a65fe22cb14b99"}, + {file = "frozenlist-1.3.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", 
hash = "sha256:1841e200fdafc3d51f974d9d377c079a0694a8f06de2e67b48150328d66d5483"}, + {file = "frozenlist-1.3.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f470c92737afa7d4c3aacc001e335062d582053d4dbe73cda126f2d7031068dd"}, + {file = "frozenlist-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:783263a4eaad7c49983fe4b2e7b53fa9770c136c270d2d4bbb6d2192bf4d9caf"}, + {file = "frozenlist-1.3.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:924620eef691990dfb56dc4709f280f40baee568c794b5c1885800c3ecc69816"}, + {file = "frozenlist-1.3.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ae4dc05c465a08a866b7a1baf360747078b362e6a6dbeb0c57f234db0ef88ae0"}, + {file = "frozenlist-1.3.3-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:bed331fe18f58d844d39ceb398b77d6ac0b010d571cba8267c2e7165806b00ce"}, + {file = "frozenlist-1.3.3-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:02c9ac843e3390826a265e331105efeab489ffaf4dd86384595ee8ce6d35ae7f"}, + {file = "frozenlist-1.3.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9545a33965d0d377b0bc823dcabf26980e77f1b6a7caa368a365a9497fb09420"}, + {file = "frozenlist-1.3.3-cp310-cp310-win32.whl", hash = "sha256:d5cd3ab21acbdb414bb6c31958d7b06b85eeb40f66463c264a9b343a4e238642"}, + {file = "frozenlist-1.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:b756072364347cb6aa5b60f9bc18e94b2f79632de3b0190253ad770c5df17db1"}, + {file = "frozenlist-1.3.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:b4395e2f8d83fbe0c627b2b696acce67868793d7d9750e90e39592b3626691b7"}, + {file = "frozenlist-1.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:14143ae966a6229350021384870458e4777d1eae4c28d1a7aa47f24d030e6678"}, + {file = "frozenlist-1.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5d8860749e813a6f65bad8285a0520607c9500caa23fea6ee407e63debcdbef6"}, + {file = "frozenlist-1.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23d16d9f477bb55b6154654e0e74557040575d9d19fe78a161bd33d7d76808e8"}, + {file = "frozenlist-1.3.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:eb82dbba47a8318e75f679690190c10a5e1f447fbf9df41cbc4c3afd726d88cb"}, + {file = "frozenlist-1.3.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9309869032abb23d196cb4e4db574232abe8b8be1339026f489eeb34a4acfd91"}, + {file = "frozenlist-1.3.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a97b4fe50b5890d36300820abd305694cb865ddb7885049587a5678215782a6b"}, + {file = "frozenlist-1.3.3-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c188512b43542b1e91cadc3c6c915a82a5eb95929134faf7fd109f14f9892ce4"}, + {file = "frozenlist-1.3.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:303e04d422e9b911a09ad499b0368dc551e8c3cd15293c99160c7f1f07b59a48"}, + {file = "frozenlist-1.3.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:0771aed7f596c7d73444c847a1c16288937ef988dc04fb9f7be4b2aa91db609d"}, + {file = "frozenlist-1.3.3-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:66080ec69883597e4d026f2f71a231a1ee9887835902dbe6b6467d5a89216cf6"}, + {file = "frozenlist-1.3.3-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:41fe21dc74ad3a779c3d73a2786bdf622ea81234bdd4faf90b8b03cad0c2c0b4"}, + {file = 
"frozenlist-1.3.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f20380df709d91525e4bee04746ba612a4df0972c1b8f8e1e8af997e678c7b81"}, + {file = "frozenlist-1.3.3-cp311-cp311-win32.whl", hash = "sha256:f30f1928162e189091cf4d9da2eac617bfe78ef907a761614ff577ef4edfb3c8"}, + {file = "frozenlist-1.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:a6394d7dadd3cfe3f4b3b186e54d5d8504d44f2d58dcc89d693698e8b7132b32"}, + {file = "frozenlist-1.3.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8df3de3a9ab8325f94f646609a66cbeeede263910c5c0de0101079ad541af332"}, + {file = "frozenlist-1.3.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0693c609e9742c66ba4870bcee1ad5ff35462d5ffec18710b4ac89337ff16e27"}, + {file = "frozenlist-1.3.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cd4210baef299717db0a600d7a3cac81d46ef0e007f88c9335db79f8979c0d3d"}, + {file = "frozenlist-1.3.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:394c9c242113bfb4b9aa36e2b80a05ffa163a30691c7b5a29eba82e937895d5e"}, + {file = "frozenlist-1.3.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6327eb8e419f7d9c38f333cde41b9ae348bec26d840927332f17e887a8dcb70d"}, + {file = "frozenlist-1.3.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e24900aa13212e75e5b366cb9065e78bbf3893d4baab6052d1aca10d46d944c"}, + {file = "frozenlist-1.3.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:3843f84a6c465a36559161e6c59dce2f2ac10943040c2fd021cfb70d58c4ad56"}, + {file = "frozenlist-1.3.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:84610c1502b2461255b4c9b7d5e9c48052601a8957cd0aea6ec7a7a1e1fb9420"}, + {file = "frozenlist-1.3.3-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:c21b9aa40e08e4f63a2f92ff3748e6b6c84d717d033c7b3438dd3123ee18f70e"}, + {file = "frozenlist-1.3.3-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:efce6ae830831ab6a22b9b4091d411698145cb9b8fc869e1397ccf4b4b6455cb"}, + {file = "frozenlist-1.3.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:40de71985e9042ca00b7953c4f41eabc3dc514a2d1ff534027f091bc74416401"}, + {file = "frozenlist-1.3.3-cp37-cp37m-win32.whl", hash = "sha256:180c00c66bde6146a860cbb81b54ee0df350d2daf13ca85b275123bbf85de18a"}, + {file = "frozenlist-1.3.3-cp37-cp37m-win_amd64.whl", hash = "sha256:9bbbcedd75acdfecf2159663b87f1bb5cfc80e7cd99f7ddd9d66eb98b14a8411"}, + {file = "frozenlist-1.3.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:034a5c08d36649591be1cbb10e09da9f531034acfe29275fc5454a3b101ce41a"}, + {file = "frozenlist-1.3.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ba64dc2b3b7b158c6660d49cdb1d872d1d0bf4e42043ad8d5006099479a194e5"}, + {file = "frozenlist-1.3.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:47df36a9fe24054b950bbc2db630d508cca3aa27ed0566c0baf661225e52c18e"}, + {file = "frozenlist-1.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:008a054b75d77c995ea26629ab3a0c0d7281341f2fa7e1e85fa6153ae29ae99c"}, + {file = "frozenlist-1.3.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:841ea19b43d438a80b4de62ac6ab21cfe6827bb8a9dc62b896acc88eaf9cecba"}, + {file = "frozenlist-1.3.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e235688f42b36be2b6b06fc37ac2126a73b75fb8d6bc66dd632aa35286238703"}, + {file = 
"frozenlist-1.3.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca713d4af15bae6e5d79b15c10c8522859a9a89d3b361a50b817c98c2fb402a2"}, + {file = "frozenlist-1.3.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ac5995f2b408017b0be26d4a1d7c61bce106ff3d9e3324374d66b5964325448"}, + {file = "frozenlist-1.3.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:a4ae8135b11652b08a8baf07631d3ebfe65a4c87909dbef5fa0cdde440444ee4"}, + {file = "frozenlist-1.3.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4ea42116ceb6bb16dbb7d526e242cb6747b08b7710d9782aa3d6732bd8d27649"}, + {file = "frozenlist-1.3.3-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:810860bb4bdce7557bc0febb84bbd88198b9dbc2022d8eebe5b3590b2ad6c842"}, + {file = "frozenlist-1.3.3-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:ee78feb9d293c323b59a6f2dd441b63339a30edf35abcb51187d2fc26e696d13"}, + {file = "frozenlist-1.3.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:0af2e7c87d35b38732e810befb9d797a99279cbb85374d42ea61c1e9d23094b3"}, + {file = "frozenlist-1.3.3-cp38-cp38-win32.whl", hash = "sha256:899c5e1928eec13fd6f6d8dc51be23f0d09c5281e40d9cf4273d188d9feeaf9b"}, + {file = "frozenlist-1.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:7f44e24fa70f6fbc74aeec3e971f60a14dde85da364aa87f15d1be94ae75aeef"}, + {file = "frozenlist-1.3.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:2b07ae0c1edaa0a36339ec6cce700f51b14a3fc6545fdd32930d2c83917332cf"}, + {file = "frozenlist-1.3.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ebb86518203e12e96af765ee89034a1dbb0c3c65052d1b0c19bbbd6af8a145e1"}, + {file = "frozenlist-1.3.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5cf820485f1b4c91e0417ea0afd41ce5cf5965011b3c22c400f6d144296ccbc0"}, + {file = "frozenlist-1.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c11e43016b9024240212d2a65043b70ed8dfd3b52678a1271972702d990ac6d"}, + {file = "frozenlist-1.3.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8fa3c6e3305aa1146b59a09b32b2e04074945ffcfb2f0931836d103a2c38f936"}, + {file = "frozenlist-1.3.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:352bd4c8c72d508778cf05ab491f6ef36149f4d0cb3c56b1b4302852255d05d5"}, + {file = "frozenlist-1.3.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:65a5e4d3aa679610ac6e3569e865425b23b372277f89b5ef06cf2cdaf1ebf22b"}, + {file = "frozenlist-1.3.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1e2c1185858d7e10ff045c496bbf90ae752c28b365fef2c09cf0fa309291669"}, + {file = "frozenlist-1.3.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f163d2fd041c630fed01bc48d28c3ed4a3b003c00acd396900e11ee5316b56bb"}, + {file = "frozenlist-1.3.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:05cdb16d09a0832eedf770cb7bd1fe57d8cf4eaf5aced29c4e41e3f20b30a784"}, + {file = "frozenlist-1.3.3-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:8bae29d60768bfa8fb92244b74502b18fae55a80eac13c88eb0b496d4268fd2d"}, + {file = "frozenlist-1.3.3-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:eedab4c310c0299961ac285591acd53dc6723a1ebd90a57207c71f6e0c2153ab"}, + {file = "frozenlist-1.3.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:3bbdf44855ed8f0fbcd102ef05ec3012d6a4fd7c7562403f76ce6a52aeffb2b1"}, + {file = 
"frozenlist-1.3.3-cp39-cp39-win32.whl", hash = "sha256:efa568b885bca461f7c7b9e032655c0c143d305bf01c30caf6db2854a4532b38"}, + {file = "frozenlist-1.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:cfe33efc9cb900a4c46f91a5ceba26d6df370ffddd9ca386eb1d4f0ad97b9ea9"}, + {file = "frozenlist-1.3.3.tar.gz", hash = "sha256:58bcc55721e8a90b88332d6cd441261ebb22342e238296bb330968952fbb3a6a"}, +] + +[[package]] +name = "idna" +version = "3.4" +description = "Internationalized Domain Names in Applications (IDNA)" +optional = false +python-versions = ">=3.5" +files = [ + {file = "idna-3.4-py3-none-any.whl", hash = "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"}, + {file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"}, +] + [[package]] name = "iniconfig" version = "2.0.0" @@ -36,6 +390,111 @@ files = [ {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, ] +[[package]] +name = "multidict" +version = "6.0.4" +description = "multidict implementation" +optional = false +python-versions = ">=3.7" +files = [ + {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0b1a97283e0c85772d613878028fec909f003993e1007eafa715b24b377cb9b8"}, + {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:eeb6dcc05e911516ae3d1f207d4b0520d07f54484c49dfc294d6e7d63b734171"}, + {file = "multidict-6.0.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d6d635d5209b82a3492508cf5b365f3446afb65ae7ebd755e70e18f287b0adf7"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c048099e4c9e9d615545e2001d3d8a4380bd403e1a0578734e0d31703d1b0c0b"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ea20853c6dbbb53ed34cb4d080382169b6f4554d394015f1bef35e881bf83547"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:16d232d4e5396c2efbbf4f6d4df89bfa905eb0d4dc5b3549d872ab898451f569"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36c63aaa167f6c6b04ef2c85704e93af16c11d20de1d133e39de6a0e84582a93"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:64bdf1086b6043bf519869678f5f2757f473dee970d7abf6da91ec00acb9cb98"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:43644e38f42e3af682690876cff722d301ac585c5b9e1eacc013b7a3f7b696a0"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7582a1d1030e15422262de9f58711774e02fa80df0d1578995c76214f6954988"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:ddff9c4e225a63a5afab9dd15590432c22e8057e1a9a13d28ed128ecf047bbdc"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:ee2a1ece51b9b9e7752e742cfb661d2a29e7bcdba2d27e66e28a99f1890e4fa0"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a2e4369eb3d47d2034032a26c7a80fcb21a2cb22e1173d761a162f11e562caa5"}, + {file = "multidict-6.0.4-cp310-cp310-win32.whl", hash = "sha256:574b7eae1ab267e5f8285f0fe881f17efe4b98c39a40858247720935b893bba8"}, + {file = "multidict-6.0.4-cp310-cp310-win_amd64.whl", hash = "sha256:4dcbb0906e38440fa3e325df2359ac6cb043df8e58c965bb45f4e406ecb162cc"}, + {file = 
"multidict-6.0.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0dfad7a5a1e39c53ed00d2dd0c2e36aed4650936dc18fd9a1826a5ae1cad6f03"}, + {file = "multidict-6.0.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:64da238a09d6039e3bd39bb3aee9c21a5e34f28bfa5aa22518581f910ff94af3"}, + {file = "multidict-6.0.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ff959bee35038c4624250473988b24f846cbeb2c6639de3602c073f10410ceba"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01a3a55bd90018c9c080fbb0b9f4891db37d148a0a18722b42f94694f8b6d4c9"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c5cb09abb18c1ea940fb99360ea0396f34d46566f157122c92dfa069d3e0e982"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:666daae833559deb2d609afa4490b85830ab0dfca811a98b70a205621a6109fe"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11bdf3f5e1518b24530b8241529d2050014c884cf18b6fc69c0c2b30ca248710"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d18748f2d30f94f498e852c67d61261c643b349b9d2a581131725595c45ec6c"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:458f37be2d9e4c95e2d8866a851663cbc76e865b78395090786f6cd9b3bbf4f4"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b1a2eeedcead3a41694130495593a559a668f382eee0727352b9a41e1c45759a"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:7d6ae9d593ef8641544d6263c7fa6408cc90370c8cb2bbb65f8d43e5b0351d9c"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:5979b5632c3e3534e42ca6ff856bb24b2e3071b37861c2c727ce220d80eee9ed"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:dcfe792765fab89c365123c81046ad4103fcabbc4f56d1c1997e6715e8015461"}, + {file = "multidict-6.0.4-cp311-cp311-win32.whl", hash = "sha256:3601a3cece3819534b11d4efc1eb76047488fddd0c85a3948099d5da4d504636"}, + {file = "multidict-6.0.4-cp311-cp311-win_amd64.whl", hash = "sha256:81a4f0b34bd92df3da93315c6a59034df95866014ac08535fc819f043bfd51f0"}, + {file = "multidict-6.0.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:67040058f37a2a51ed8ea8f6b0e6ee5bd78ca67f169ce6122f3e2ec80dfe9b78"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:853888594621e6604c978ce2a0444a1e6e70c8d253ab65ba11657659dcc9100f"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:39ff62e7d0f26c248b15e364517a72932a611a9b75f35b45be078d81bdb86603"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:af048912e045a2dc732847d33821a9d84ba553f5c5f028adbd364dd4765092ac"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1e8b901e607795ec06c9e42530788c45ac21ef3aaa11dbd0c69de543bfb79a9"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:62501642008a8b9871ddfccbf83e4222cf8ac0d5aeedf73da36153ef2ec222d2"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:99b76c052e9f1bc0721f7541e5e8c05db3941eb9ebe7b8553c625ef88d6eefde"}, + {file = 
"multidict-6.0.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:509eac6cf09c794aa27bcacfd4d62c885cce62bef7b2c3e8b2e49d365b5003fe"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:21a12c4eb6ddc9952c415f24eef97e3e55ba3af61f67c7bc388dcdec1404a067"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:5cad9430ab3e2e4fa4a2ef4450f548768400a2ac635841bc2a56a2052cdbeb87"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ab55edc2e84460694295f401215f4a58597f8f7c9466faec545093045476327d"}, + {file = "multidict-6.0.4-cp37-cp37m-win32.whl", hash = "sha256:5a4dcf02b908c3b8b17a45fb0f15b695bf117a67b76b7ad18b73cf8e92608775"}, + {file = "multidict-6.0.4-cp37-cp37m-win_amd64.whl", hash = "sha256:6ed5f161328b7df384d71b07317f4d8656434e34591f20552c7bcef27b0ab88e"}, + {file = "multidict-6.0.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5fc1b16f586f049820c5c5b17bb4ee7583092fa0d1c4e28b5239181ff9532e0c"}, + {file = "multidict-6.0.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1502e24330eb681bdaa3eb70d6358e818e8e8f908a22a1851dfd4e15bc2f8161"}, + {file = "multidict-6.0.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b692f419760c0e65d060959df05f2a531945af31fda0c8a3b3195d4efd06de11"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45e1ecb0379bfaab5eef059f50115b54571acfbe422a14f668fc8c27ba410e7e"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ddd3915998d93fbcd2566ddf9cf62cdb35c9e093075f862935573d265cf8f65d"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:59d43b61c59d82f2effb39a93c48b845efe23a3852d201ed2d24ba830d0b4cf2"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc8e1d0c705233c5dd0c5e6460fbad7827d5d36f310a0fadfd45cc3029762258"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6aa0418fcc838522256761b3415822626f866758ee0bc6632c9486b179d0b52"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6748717bb10339c4760c1e63da040f5f29f5ed6e59d76daee30305894069a660"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4d1a3d7ef5e96b1c9e92f973e43aa5e5b96c659c9bc3124acbbd81b0b9c8a951"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4372381634485bec7e46718edc71528024fcdc6f835baefe517b34a33c731d60"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:fc35cb4676846ef752816d5be2193a1e8367b4c1397b74a565a9d0389c433a1d"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:4b9d9e4e2b37daddb5c23ea33a3417901fa7c7b3dee2d855f63ee67a0b21e5b1"}, + {file = "multidict-6.0.4-cp38-cp38-win32.whl", hash = "sha256:e41b7e2b59679edfa309e8db64fdf22399eec4b0b24694e1b2104fb789207779"}, + {file = "multidict-6.0.4-cp38-cp38-win_amd64.whl", hash = "sha256:d6c254ba6e45d8e72739281ebc46ea5eb5f101234f3ce171f0e9f5cc86991480"}, + {file = "multidict-6.0.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:16ab77bbeb596e14212e7bab8429f24c1579234a3a462105cda4a66904998664"}, + {file = "multidict-6.0.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc779e9e6f7fda81b3f9aa58e3a6091d49ad528b11ed19f6621408806204ad35"}, + {file = "multidict-6.0.4-cp39-cp39-macosx_11_0_arm64.whl", hash = 
"sha256:4ceef517eca3e03c1cceb22030a3e39cb399ac86bff4e426d4fc6ae49052cc60"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:281af09f488903fde97923c7744bb001a9b23b039a909460d0f14edc7bf59706"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:52f2dffc8acaba9a2f27174c41c9e57f60b907bb9f096b36b1a1f3be71c6284d"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b41156839806aecb3641f3208c0dafd3ac7775b9c4c422d82ee2a45c34ba81ca"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5e3fc56f88cc98ef8139255cf8cd63eb2c586531e43310ff859d6bb3a6b51f1"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8316a77808c501004802f9beebde51c9f857054a0c871bd6da8280e718444449"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f70b98cd94886b49d91170ef23ec5c0e8ebb6f242d734ed7ed677b24d50c82cf"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bf6774e60d67a9efe02b3616fee22441d86fab4c6d335f9d2051d19d90a40063"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:e69924bfcdda39b722ef4d9aa762b2dd38e4632b3641b1d9a57ca9cd18f2f83a"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:6b181d8c23da913d4ff585afd1155a0e1194c0b50c54fcfe286f70cdaf2b7176"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:52509b5be062d9eafc8170e53026fbc54cf3b32759a23d07fd935fb04fc22d95"}, + {file = "multidict-6.0.4-cp39-cp39-win32.whl", hash = "sha256:27c523fbfbdfd19c6867af7346332b62b586eed663887392cff78d614f9ec313"}, + {file = "multidict-6.0.4-cp39-cp39-win_amd64.whl", hash = "sha256:33029f5734336aa0d4c0384525da0387ef89148dc7191aae00ca5fb23d7aafc2"}, + {file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"}, +] + +[[package]] +name = "openai" +version = "0.27.8" +description = "Python client library for the OpenAI API" +optional = false +python-versions = ">=3.7.1" +files = [ + {file = "openai-0.27.8-py3-none-any.whl", hash = "sha256:e0a7c2f7da26bdbe5354b03c6d4b82a2f34bd4458c7a17ae1a7092c3e397e03c"}, + {file = "openai-0.27.8.tar.gz", hash = "sha256:2483095c7db1eee274cebac79e315a986c4e55207bb4fa7b82d185b3a2ed9536"}, +] + +[package.dependencies] +aiohttp = "*" +requests = ">=2.20" +tqdm = "*" + +[package.extras] +datalib = ["numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] +dev = ["black (>=21.6b0,<22.0)", "pytest (==6.*)", "pytest-asyncio", "pytest-mock"] +embeddings = ["matplotlib", "numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)", "plotly", "scikit-learn (>=1.0.2)", "scipy", "tenacity (>=8.0.1)"] +wandb = ["numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)", "wandb"] + [[package]] name = "packaging" version = "23.1" @@ -62,6 +521,58 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "pydantic" +version = "1.10.9" +description = "Data validation and settings management using python type hints" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pydantic-1.10.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e692dec4a40bfb40ca530e07805b1208c1de071a18d26af4a2a0d79015b352ca"}, + {file = 
"pydantic-1.10.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3c52eb595db83e189419bf337b59154bdcca642ee4b2a09e5d7797e41ace783f"}, + {file = "pydantic-1.10.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:939328fd539b8d0edf244327398a667b6b140afd3bf7e347cf9813c736211896"}, + {file = "pydantic-1.10.9-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b48d3d634bca23b172f47f2335c617d3fcb4b3ba18481c96b7943a4c634f5c8d"}, + {file = "pydantic-1.10.9-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:f0b7628fb8efe60fe66fd4adadd7ad2304014770cdc1f4934db41fe46cc8825f"}, + {file = "pydantic-1.10.9-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:e1aa5c2410769ca28aa9a7841b80d9d9a1c5f223928ca8bec7e7c9a34d26b1d4"}, + {file = "pydantic-1.10.9-cp310-cp310-win_amd64.whl", hash = "sha256:eec39224b2b2e861259d6f3c8b6290d4e0fbdce147adb797484a42278a1a486f"}, + {file = "pydantic-1.10.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d111a21bbbfd85c17248130deac02bbd9b5e20b303338e0dbe0faa78330e37e0"}, + {file = "pydantic-1.10.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e9aec8627a1a6823fc62fb96480abe3eb10168fd0d859ee3d3b395105ae19a7"}, + {file = "pydantic-1.10.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:07293ab08e7b4d3c9d7de4949a0ea571f11e4557d19ea24dd3ae0c524c0c334d"}, + {file = "pydantic-1.10.9-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7ee829b86ce984261d99ff2fd6e88f2230068d96c2a582f29583ed602ef3fc2c"}, + {file = "pydantic-1.10.9-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4b466a23009ff5cdd7076eb56aca537c745ca491293cc38e72bf1e0e00de5b91"}, + {file = "pydantic-1.10.9-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7847ca62e581e6088d9000f3c497267868ca2fa89432714e21a4fb33a04d52e8"}, + {file = "pydantic-1.10.9-cp311-cp311-win_amd64.whl", hash = "sha256:7845b31959468bc5b78d7b95ec52fe5be32b55d0d09983a877cca6aedc51068f"}, + {file = "pydantic-1.10.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:517a681919bf880ce1dac7e5bc0c3af1e58ba118fd774da2ffcd93c5f96eaece"}, + {file = "pydantic-1.10.9-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67195274fd27780f15c4c372f4ba9a5c02dad6d50647b917b6a92bf00b3d301a"}, + {file = "pydantic-1.10.9-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2196c06484da2b3fded1ab6dbe182bdabeb09f6318b7fdc412609ee2b564c49a"}, + {file = "pydantic-1.10.9-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:6257bb45ad78abacda13f15bde5886efd6bf549dd71085e64b8dcf9919c38b60"}, + {file = "pydantic-1.10.9-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:3283b574b01e8dbc982080d8287c968489d25329a463b29a90d4157de4f2baaf"}, + {file = "pydantic-1.10.9-cp37-cp37m-win_amd64.whl", hash = "sha256:5f8bbaf4013b9a50e8100333cc4e3fa2f81214033e05ac5aa44fa24a98670a29"}, + {file = "pydantic-1.10.9-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b9cd67fb763248cbe38f0593cd8611bfe4b8ad82acb3bdf2b0898c23415a1f82"}, + {file = "pydantic-1.10.9-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f50e1764ce9353be67267e7fd0da08349397c7db17a562ad036aa7c8f4adfdb6"}, + {file = "pydantic-1.10.9-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:73ef93e5e1d3c8e83f1ff2e7fdd026d9e063c7e089394869a6e2985696693766"}, + {file = 
"pydantic-1.10.9-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:128d9453d92e6e81e881dd7e2484e08d8b164da5507f62d06ceecf84bf2e21d3"}, + {file = "pydantic-1.10.9-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ad428e92ab68798d9326bb3e5515bc927444a3d71a93b4a2ca02a8a5d795c572"}, + {file = "pydantic-1.10.9-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:fab81a92f42d6d525dd47ced310b0c3e10c416bbfae5d59523e63ea22f82b31e"}, + {file = "pydantic-1.10.9-cp38-cp38-win_amd64.whl", hash = "sha256:963671eda0b6ba6926d8fc759e3e10335e1dc1b71ff2a43ed2efd6996634dafb"}, + {file = "pydantic-1.10.9-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:970b1bdc6243ef663ba5c7e36ac9ab1f2bfecb8ad297c9824b542d41a750b298"}, + {file = "pydantic-1.10.9-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7e1d5290044f620f80cf1c969c542a5468f3656de47b41aa78100c5baa2b8276"}, + {file = "pydantic-1.10.9-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:83fcff3c7df7adff880622a98022626f4f6dbce6639a88a15a3ce0f96466cb60"}, + {file = "pydantic-1.10.9-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0da48717dc9495d3a8f215e0d012599db6b8092db02acac5e0d58a65248ec5bc"}, + {file = "pydantic-1.10.9-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:0a2aabdc73c2a5960e87c3ffebca6ccde88665616d1fd6d3db3178ef427b267a"}, + {file = "pydantic-1.10.9-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9863b9420d99dfa9c064042304868e8ba08e89081428a1c471858aa2af6f57c4"}, + {file = "pydantic-1.10.9-cp39-cp39-win_amd64.whl", hash = "sha256:e7c9900b43ac14110efa977be3da28931ffc74c27e96ee89fbcaaf0b0fe338e1"}, + {file = "pydantic-1.10.9-py3-none-any.whl", hash = "sha256:6cafde02f6699ce4ff643417d1a9223716ec25e228ddc3b436fe7e2d25a1f305"}, + {file = "pydantic-1.10.9.tar.gz", hash = "sha256:95c70da2cd3b6ddf3b9645ecaa8d98f3d80c606624b6d245558d202cd23ea3be"}, +] + +[package.dependencies] +typing-extensions = ">=4.2.0" + +[package.extras] +dotenv = ["python-dotenv (>=0.10.4)"] +email = ["email-validator (>=1.0.3)"] + [[package]] name = "pytest" version = "7.3.2" @@ -84,6 +595,27 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +[[package]] +name = "requests" +version = "2.31.0" +description = "Python HTTP for Humans." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"}, + {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"}, +] + +[package.dependencies] +certifi = ">=2017.4.17" +charset-normalizer = ">=2,<4" +idna = ">=2.5,<4" +urllib3 = ">=1.21.1,<3" + +[package.extras] +socks = ["PySocks (>=1.5.6,!=1.5.7)"] +use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] + [[package]] name = "tomli" version = "2.0.1" @@ -95,7 +627,142 @@ files = [ {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ] +[[package]] +name = "tqdm" +version = "4.65.0" +description = "Fast, Extensible Progress Meter" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tqdm-4.65.0-py3-none-any.whl", hash = "sha256:c4f53a17fe37e132815abceec022631be8ffe1b9381c2e6e30aa70edc99e9671"}, + {file = "tqdm-4.65.0.tar.gz", hash = "sha256:1871fb68a86b8fb3b59ca4cdd3dcccbc7e6d613eeed31f4c332531977b89beb5"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[package.extras] +dev = ["py-make (>=0.1.0)", "twine", "wheel"] +notebook = ["ipywidgets (>=6)"] +slack = ["slack-sdk"] +telegram = ["requests"] + +[[package]] +name = "typing-extensions" +version = "4.6.3" +description = "Backported and Experimental Type Hints for Python 3.7+" +optional = false +python-versions = ">=3.7" +files = [ + {file = "typing_extensions-4.6.3-py3-none-any.whl", hash = "sha256:88a4153d8505aabbb4e13aacb7c486c2b4a33ca3b3f807914a9b4c844c471c26"}, + {file = "typing_extensions-4.6.3.tar.gz", hash = "sha256:d91d5919357fe7f681a9f2b5b4cb2a5f1ef0a1e9f59c4d8ff0d3491e05c0ffd5"}, +] + +[[package]] +name = "urllib3" +version = "2.0.3" +description = "HTTP library with thread-safe connection pooling, file post, and more." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "urllib3-2.0.3-py3-none-any.whl", hash = "sha256:48e7fafa40319d358848e1bc6809b208340fafe2096f1725d05d67443d0483d1"}, + {file = "urllib3-2.0.3.tar.gz", hash = "sha256:bee28b5e56addb8226c96f7f13ac28cb4c301dd5ea8a6ca179c0b9835e032825"}, +] + +[package.extras] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] +secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17.1.0)", "urllib3-secure-extra"] +socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] +zstd = ["zstandard (>=0.18.0)"] + +[[package]] +name = "yarl" +version = "1.9.2" +description = "Yet another URL library" +optional = false +python-versions = ">=3.7" +files = [ + {file = "yarl-1.9.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:8c2ad583743d16ddbdf6bb14b5cd76bf43b0d0006e918809d5d4ddf7bde8dd82"}, + {file = "yarl-1.9.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:82aa6264b36c50acfb2424ad5ca537a2060ab6de158a5bd2a72a032cc75b9eb8"}, + {file = "yarl-1.9.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c0c77533b5ed4bcc38e943178ccae29b9bcf48ffd1063f5821192f23a1bd27b9"}, + {file = "yarl-1.9.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ee4afac41415d52d53a9833ebae7e32b344be72835bbb589018c9e938045a560"}, + {file = "yarl-1.9.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9bf345c3a4f5ba7f766430f97f9cc1320786f19584acc7086491f45524a551ac"}, + {file = "yarl-1.9.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2a96c19c52ff442a808c105901d0bdfd2e28575b3d5f82e2f5fd67e20dc5f4ea"}, + {file = "yarl-1.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:891c0e3ec5ec881541f6c5113d8df0315ce5440e244a716b95f2525b7b9f3608"}, + {file = "yarl-1.9.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c3a53ba34a636a256d767c086ceb111358876e1fb6b50dfc4d3f4951d40133d5"}, + {file = "yarl-1.9.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:566185e8ebc0898b11f8026447eacd02e46226716229cea8db37496c8cdd26e0"}, + {file = "yarl-1.9.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:2b0738fb871812722a0ac2154be1f049c6223b9f6f22eec352996b69775b36d4"}, + {file = "yarl-1.9.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:32f1d071b3f362c80f1a7d322bfd7b2d11e33d2adf395cc1dd4df36c9c243095"}, + {file = "yarl-1.9.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:e9fdc7ac0d42bc3ea78818557fab03af6181e076a2944f43c38684b4b6bed8e3"}, + {file = "yarl-1.9.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:56ff08ab5df8429901ebdc5d15941b59f6253393cb5da07b4170beefcf1b2528"}, + {file = "yarl-1.9.2-cp310-cp310-win32.whl", hash = "sha256:8ea48e0a2f931064469bdabca50c2f578b565fc446f302a79ba6cc0ee7f384d3"}, + {file = "yarl-1.9.2-cp310-cp310-win_amd64.whl", hash = "sha256:50f33040f3836e912ed16d212f6cc1efb3231a8a60526a407aeb66c1c1956dde"}, + {file = "yarl-1.9.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:646d663eb2232d7909e6601f1a9107e66f9791f290a1b3dc7057818fe44fc2b6"}, + {file = "yarl-1.9.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:aff634b15beff8902d1f918012fc2a42e0dbae6f469fce134c8a0dc51ca423bb"}, + {file = "yarl-1.9.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a83503934c6273806aed765035716216cc9ab4e0364f7f066227e1aaea90b8d0"}, + {file = "yarl-1.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:b25322201585c69abc7b0e89e72790469f7dad90d26754717f3310bfe30331c2"}, + {file = "yarl-1.9.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:22a94666751778629f1ec4280b08eb11815783c63f52092a5953faf73be24191"}, + {file = "yarl-1.9.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8ec53a0ea2a80c5cd1ab397925f94bff59222aa3cf9c6da938ce05c9ec20428d"}, + {file = "yarl-1.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:159d81f22d7a43e6eabc36d7194cb53f2f15f498dbbfa8edc8a3239350f59fe7"}, + {file = "yarl-1.9.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:832b7e711027c114d79dffb92576acd1bd2decc467dec60e1cac96912602d0e6"}, + {file = "yarl-1.9.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:95d2ecefbcf4e744ea952d073c6922e72ee650ffc79028eb1e320e732898d7e8"}, + {file = "yarl-1.9.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:d4e2c6d555e77b37288eaf45b8f60f0737c9efa3452c6c44626a5455aeb250b9"}, + {file = "yarl-1.9.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:783185c75c12a017cc345015ea359cc801c3b29a2966c2655cd12b233bf5a2be"}, + {file = "yarl-1.9.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:b8cc1863402472f16c600e3e93d542b7e7542a540f95c30afd472e8e549fc3f7"}, + {file = "yarl-1.9.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:822b30a0f22e588b32d3120f6d41e4ed021806418b4c9f0bc3048b8c8cb3f92a"}, + {file = "yarl-1.9.2-cp311-cp311-win32.whl", hash = "sha256:a60347f234c2212a9f0361955007fcf4033a75bf600a33c88a0a8e91af77c0e8"}, + {file = "yarl-1.9.2-cp311-cp311-win_amd64.whl", hash = "sha256:be6b3fdec5c62f2a67cb3f8c6dbf56bbf3f61c0f046f84645cd1ca73532ea051"}, + {file = "yarl-1.9.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:38a3928ae37558bc1b559f67410df446d1fbfa87318b124bf5032c31e3447b74"}, + {file = "yarl-1.9.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ac9bb4c5ce3975aeac288cfcb5061ce60e0d14d92209e780c93954076c7c4367"}, + {file = "yarl-1.9.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3da8a678ca8b96c8606bbb8bfacd99a12ad5dd288bc6f7979baddd62f71c63ef"}, + {file = "yarl-1.9.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:13414591ff516e04fcdee8dc051c13fd3db13b673c7a4cb1350e6b2ad9639ad3"}, + {file = "yarl-1.9.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf74d08542c3a9ea97bb8f343d4fcbd4d8f91bba5ec9d5d7f792dbe727f88938"}, + {file = "yarl-1.9.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e7221580dc1db478464cfeef9b03b95c5852cc22894e418562997df0d074ccc"}, + {file = "yarl-1.9.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:494053246b119b041960ddcd20fd76224149cfea8ed8777b687358727911dd33"}, + {file = "yarl-1.9.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:52a25809fcbecfc63ac9ba0c0fb586f90837f5425edfd1ec9f3372b119585e45"}, + {file = "yarl-1.9.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:e65610c5792870d45d7b68c677681376fcf9cc1c289f23e8e8b39c1485384185"}, + {file = "yarl-1.9.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:1b1bba902cba32cdec51fca038fd53f8beee88b77efc373968d1ed021024cc04"}, + {file = "yarl-1.9.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:662e6016409828ee910f5d9602a2729a8a57d74b163c89a837de3fea050c7582"}, + {file = "yarl-1.9.2-cp37-cp37m-win32.whl", hash = 
"sha256:f364d3480bffd3aa566e886587eaca7c8c04d74f6e8933f3f2c996b7f09bee1b"}, + {file = "yarl-1.9.2-cp37-cp37m-win_amd64.whl", hash = "sha256:6a5883464143ab3ae9ba68daae8e7c5c95b969462bbe42e2464d60e7e2698368"}, + {file = "yarl-1.9.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5610f80cf43b6202e2c33ba3ec2ee0a2884f8f423c8f4f62906731d876ef4fac"}, + {file = "yarl-1.9.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b9a4e67ad7b646cd6f0938c7ebfd60e481b7410f574c560e455e938d2da8e0f4"}, + {file = "yarl-1.9.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:83fcc480d7549ccebe9415d96d9263e2d4226798c37ebd18c930fce43dfb9574"}, + {file = "yarl-1.9.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5fcd436ea16fee7d4207c045b1e340020e58a2597301cfbcfdbe5abd2356c2fb"}, + {file = "yarl-1.9.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:84e0b1599334b1e1478db01b756e55937d4614f8654311eb26012091be109d59"}, + {file = "yarl-1.9.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3458a24e4ea3fd8930e934c129b676c27452e4ebda80fbe47b56d8c6c7a63a9e"}, + {file = "yarl-1.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:838162460b3a08987546e881a2bfa573960bb559dfa739e7800ceeec92e64417"}, + {file = "yarl-1.9.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f4e2d08f07a3d7d3e12549052eb5ad3eab1c349c53ac51c209a0e5991bbada78"}, + {file = "yarl-1.9.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:de119f56f3c5f0e2fb4dee508531a32b069a5f2c6e827b272d1e0ff5ac040333"}, + {file = "yarl-1.9.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:149ddea5abf329752ea5051b61bd6c1d979e13fbf122d3a1f9f0c8be6cb6f63c"}, + {file = "yarl-1.9.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:674ca19cbee4a82c9f54e0d1eee28116e63bc6fd1e96c43031d11cbab8b2afd5"}, + {file = "yarl-1.9.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:9b3152f2f5677b997ae6c804b73da05a39daa6a9e85a512e0e6823d81cdad7cc"}, + {file = "yarl-1.9.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:5415d5a4b080dc9612b1b63cba008db84e908b95848369aa1da3686ae27b6d2b"}, + {file = "yarl-1.9.2-cp38-cp38-win32.whl", hash = "sha256:f7a3d8146575e08c29ed1cd287068e6d02f1c7bdff8970db96683b9591b86ee7"}, + {file = "yarl-1.9.2-cp38-cp38-win_amd64.whl", hash = "sha256:63c48f6cef34e6319a74c727376e95626f84ea091f92c0250a98e53e62c77c72"}, + {file = "yarl-1.9.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:75df5ef94c3fdc393c6b19d80e6ef1ecc9ae2f4263c09cacb178d871c02a5ba9"}, + {file = "yarl-1.9.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c027a6e96ef77d401d8d5a5c8d6bc478e8042f1e448272e8d9752cb0aff8b5c8"}, + {file = "yarl-1.9.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f3b078dbe227f79be488ffcfc7a9edb3409d018e0952cf13f15fd6512847f3f7"}, + {file = "yarl-1.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:59723a029760079b7d991a401386390c4be5bfec1e7dd83e25a6a0881859e716"}, + {file = "yarl-1.9.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b03917871bf859a81ccb180c9a2e6c1e04d2f6a51d953e6a5cdd70c93d4e5a2a"}, + {file = "yarl-1.9.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c1012fa63eb6c032f3ce5d2171c267992ae0c00b9e164efe4d73db818465fac3"}, + {file = "yarl-1.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a74dcbfe780e62f4b5a062714576f16c2f3493a0394e555ab141bf0d746bb955"}, + {file = 
"yarl-1.9.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8c56986609b057b4839968ba901944af91b8e92f1725d1a2d77cbac6972b9ed1"}, + {file = "yarl-1.9.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:2c315df3293cd521033533d242d15eab26583360b58f7ee5d9565f15fee1bef4"}, + {file = "yarl-1.9.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:b7232f8dfbd225d57340e441d8caf8652a6acd06b389ea2d3222b8bc89cbfca6"}, + {file = "yarl-1.9.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:53338749febd28935d55b41bf0bcc79d634881195a39f6b2f767870b72514caf"}, + {file = "yarl-1.9.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:066c163aec9d3d073dc9ffe5dd3ad05069bcb03fcaab8d221290ba99f9f69ee3"}, + {file = "yarl-1.9.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8288d7cd28f8119b07dd49b7230d6b4562f9b61ee9a4ab02221060d21136be80"}, + {file = "yarl-1.9.2-cp39-cp39-win32.whl", hash = "sha256:b124e2a6d223b65ba8768d5706d103280914d61f5cae3afbc50fc3dfcc016623"}, + {file = "yarl-1.9.2-cp39-cp39-win_amd64.whl", hash = "sha256:61016e7d582bc46a5378ffdd02cd0314fb8ba52f40f9cf4d9a5e7dbef88dee18"}, + {file = "yarl-1.9.2.tar.gz", hash = "sha256:04ab9d4b9f587c06d801c2abfe9317b77cdf996c65a90d5e84ecc45010823571"}, +] + +[package.dependencies] +idna = ">=2.0" +multidict = ">=4.0" + [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "c5b989915c413ab901c39dd0c4f3b0fe203558c2879952a2460a52bda4f3e857" +content-hash = "a13e69f2bd9e511e1af92ed02b155a90dec38a9b8d983a711e1b67931b467d38" diff --git a/pyproject.toml b/pyproject.toml index 2c099a5b8..f88821cf2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,10 +5,15 @@ description = "Benchmarking the performance of agents far and wide, regardless o authors = ["Silen Naihin "] license = "MIT" readme = "README.md" +packages = [{include = "agbenchmark"}] [tool.poetry.dependencies] python = "^3.9" pytest = "^7.3.2" +click = "^8.1.3" +requests = "^2.31.0" +openai = "^0.27.8" +pydantic = "^1.10.9" [build-system] @@ -19,5 +24,11 @@ build-backend = "poetry.core.masonry.api" minversion = "6.0" addopts = "-ra -q" testpaths = [ - "tests", "benchmark/challenges", + "tests", "agbenchmark", ] +markers = [ + "retrieval", +] + +[tool.poetry.scripts] +agbenchmark = "agbenchmark.start_benchmark:cli" diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/test_api.py b/tests/test_api.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tests/test_workspace_manager.py b/tests/test_workspace_manager.py deleted file mode 100644 index e69de29bb..000000000 -- cgit v1.2.3 From e5974ca3ea5e3c781f12b66805dea5f1db15d75c Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Wed, 21 Jun 2023 11:44:59 -0400 Subject: Delete file_to_check.txt --- file_to_check.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 file_to_check.txt diff --git a/file_to_check.txt b/file_to_check.txt deleted file mode 100644 index 29afa8611..000000000 --- a/file_to_check.txt +++ /dev/null @@ -1 +0,0 @@ -The capital of America is Washington, D.C. 
\ No newline at end of file
-- cgit v1.2.3

From 15c5469bb1aabf291864b5ba11981948b7b64fb2 Mon Sep 17 00:00:00 2001
From: Silen Naihin
Date: Thu, 22 Jun 2023 08:18:22 -0400
Subject: Add automatic regression markers (#38)

---
 README.md | 2 +-
 agbenchmark/conftest.py | 33 ++++
 agbenchmark/start_benchmark.py | 64 +++++++++++++++--------
 agbenchmark/tests/regression/RegressionManager.py | 22 ++++++++
 agbenchmark/tests/regression/regression_tests.txt | 0
 pyproject.toml | 1 +
 6 files changed, 99 insertions(+), 23 deletions(-)
 create mode 100644 agbenchmark/tests/regression/RegressionManager.py
 create mode 100644 agbenchmark/tests/regression/regression_tests.txt

diff --git a/README.md b/README.md
index 216f1202c..b46562d2d 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ Share your progress :)
 1. `pip install auto-gpt-benchmarks`
 2. Add boilerplate code to start webserver to your agent (run loop and stop condition)
-3. `agbenchmark start --challenge challenge_category` remove challenge flag to run all tests. specify config of hostname, port, and workspace directory
+3. `agbenchmark start --category challenge_category`. Omit the category flag to run all tests. Specify the hostname, port, and workspace directory in the config
 4. We call the server to run the agent for each test
 5. Show pass rate of tests, logs, and any other metrics

diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py
index b3ca086d8..55f5ca82d 100644
--- a/agbenchmark/conftest.py
+++ b/agbenchmark/conftest.py
@@ -3,6 +3,7 @@ import os
 import pytest
 import shutil
 from agbenchmark.mocks.tests.retrieval_manual import mock_retrieval
+from agbenchmark.tests.regression.RegressionManager import RegressionManager
 import requests
 
 
@@ -41,3 +42,35 @@ def server_response(request, config):
     #     response.status_code == 200
     # ), f"Request failed with status code {response.status_code}"
     mock_retrieval(task, config["workspace"])
+
+
+regression_txt = "agbenchmark/tests/regression/regression_tests.txt"
+
+regression_manager = RegressionManager(regression_txt)
+
+
+def pytest_runtest_makereport(item, call):
+    """Called for each test report. Generated for each stage
+    of a test run (setup, call, teardown)."""
+    if call.when == "call":
+        if (
+            call.excinfo is None
+        ):  # if no error in the call stage, add it as a regression test
+            regression_manager.add_test(item.nodeid)
+        else:  # otherwise, :(
+            regression_manager.remove_test(item.nodeid)
+
+
+def pytest_collection_modifyitems(items):
+    """Called once all test items are collected. Used
+    to add regression marker to collected test items."""
+    for item in items:
+        print("pytest_collection_modifyitems", item.nodeid)
+        if item.nodeid + "\n" in regression_manager.tests:
+            print(regression_manager.tests)
+            item.add_marker(pytest.mark.regression)
+
+
+def pytest_sessionfinish():
+    """Called at the end of the session to save regression tests"""
+    regression_manager.save()

diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py
index 79f308435..b7a116ebc 100644
--- a/agbenchmark/start_benchmark.py
+++ b/agbenchmark/start_benchmark.py
@@ -10,38 +10,58 @@ def cli():
 
 
 @cli.command()
-@click.option("--challenge", default=None, help="Specific challenge to run")
-def start(challenge):
-    """Start the benchmark tests. If a challenge flag is provided, run the challenges with that mark."""
-    with open("agbenchmark/config.json", "r") as f:
-        config = json.load(f)
+@click.option("--category", default=None, help="Specific category to run")
+@click.option("--noreg", is_flag=True, help="Skip regression tests")
+def start(category, noreg):
+    """Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
+    config_file = "agbenchmark/config.json"
 
-    print("Current configuration:")
-    for key, value in config.items():
-        print(f"{key}: {value}")
+    # Check if configuration file exists and is not empty
+    if not os.path.exists(config_file) or os.stat(config_file).st_size == 0:
+        config = {}
 
-    update_config = click.confirm(
-        "\nDo you want to update these parameters?", default=False
-    )
-    if update_config:
         config["hostname"] = click.prompt(
-            "\nPlease enter a new hostname", default=config["hostname"]
+            "\nPlease enter a new hostname", default="localhost"
         )
-        config["port"] = click.prompt("Please enter a new port", default=config["port"])
+        config["port"] = click.prompt("Please enter a new port", default=8080)
         config["workspace"] = click.prompt(
-            "Please enter a new workspace path", default=config["workspace"]
+            "Please enter a new workspace path", default="/path/to/workspace"
         )
 
-    with open("agbenchmark/config.json", "w") as f:
-        json.dump(config, f)
+        with open(config_file, "w") as f:
+            json.dump(config, f)
+    else:
+        # If the configuration file exists and is not empty, load it
+        with open(config_file, "r") as f:
+            config = json.load(f)
 
-    print("Starting benchmark tests...", challenge)
-    if challenge:
-        print(f"Running {challenge} challenges")
-        pytest.main(["agbenchmark", "-m", challenge, "-vs"])
+    print("Current configuration:")
+    for key, value in config.items():
+        print(f"{key}: {value}")
+
+    print("Starting benchmark tests...", category)
+    pytest_args = ["agbenchmark", "-vs"]
+    if category:
+        pytest_args.extend(
+            ["-m", category]
+        )  # run categories that have the given marker
+        if noreg:
+            pytest_args.extend(
+                ["-k", "not regression"]
+            )  # run categories with the given marker, but exclude regression tests
+        print(f"Running {'non-regression ' + category if noreg else category} categories")
     else:
-        print("Running all challenges")
-        pytest.main(["agbenchmark", "-vs"])
+        if noreg:
+            print("Running all non-regression categories")
+            pytest_args.extend(
+                ["-k", "not regression"]
+            )  # run everything except regression tests
+        else:
+            print("Running all categories")  # run all categories
+
+    # Run pytest with the constructed arguments
+    pytest.main(pytest_args)
 
 
 if __name__ == "__main__":

diff --git a/agbenchmark/tests/regression/RegressionManager.py b/agbenchmark/tests/regression/RegressionManager.py
new file mode 100644
index 000000000..9117d53f1
--- /dev/null
+++ b/agbenchmark/tests/regression/RegressionManager.py
@@ -0,0 +1,22 @@
+class RegressionManager:
+    """Abstracts interaction with the regression tests file"""
+
+    def __init__(self, filename: str):
+        self.filename = filename
+        self.load()
+
+    def load(self) -> None:
+        with open(self.filename, "r") as f:
+            self.tests = f.readlines()
+
+    def save(self) -> None:
+        with open(self.filename, "w") as f:
+            f.writelines(self.tests)
+
+    def add_test(self, test_id) -> None:
+        if f"{test_id}\n" not in self.tests:
+            self.tests.append(f"{test_id}\n")
+
+    def remove_test(self, test_id) -> None:
+        if f"{test_id}\n" in self.tests:
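+            # load() stores each entry as a "nodeid\n" string, so remove exactly that form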
+            self.tests.remove(f"{test_id}\n")

diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt
new file mode 100644
index 000000000..e69de29bb

diff --git a/pyproject.toml b/pyproject.toml
index f88821cf2..5498381a2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,6 +28,7 @@ testpaths = [
 ]
 markers = [
     "retrieval",
+    "regression"
 ]
 
 [tool.poetry.scripts]
-- cgit v1.2.3

From ffd1d15a0e32d608304f4e356eff2fbc306b3007 Mon Sep 17 00:00:00 2001
From: Silen Naihin
Date: Fri, 23 Jun 2023 07:53:57 -0400
Subject: MockManager, mock_func in data.json (#39)

---
 agbenchmark/challenges/README.md | 3 +-
 agbenchmark/challenges/define_task_types.py | 1 +
 agbenchmark/challenges/retrieval/r1/r1_data.json | 3 +-
 agbenchmark/challenges/retrieval/r1/r1_test.py | 2 +-
 agbenchmark/conftest.py | 40 ++++++++++++++++++------
 agbenchmark/mocks/MockManager.py | 28 +++++++++++++++++
 agbenchmark/mocks/tests/basic_mocks.py | 0
 agbenchmark/mocks/tests/retrieval_manual.py | 10 ------
 agbenchmark/mocks/tests/retrieval_mocks.py | 13 ++++++++
 9 files changed, 77 insertions(+), 23 deletions(-)
 create mode 100644 agbenchmark/mocks/MockManager.py
 create mode 100644 agbenchmark/mocks/tests/basic_mocks.py
 delete mode 100644 agbenchmark/mocks/tests/retrieval_manual.py
 create mode 100644 agbenchmark/mocks/tests/retrieval_mocks.py

diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md
index 40281c99b..50efe2c4d 100644
--- a/agbenchmark/challenges/README.md
+++ b/agbenchmark/challenges/README.md
@@ -17,8 +17,9 @@ Input:
   - **answer** (str): The raw text of ground truth answer
   - **should_contain** (list): the exact strings that is required in the final answer
   - **should_not_contain** (list): the exact strings that should not be in the final answer
-  - **files**: files that the are used for retrieval
+  - **files**: files that are used for retrieval. Can specify a file here or an extension **TODO:** like .txt
 - **difficulty**(str): the difficulty of this query. choices from
+- **mock_func**: function to mock the agent's response. This is used for testing purposes.
 
 Example:

diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py
index 94b5ba533..f1a841b53 100644
--- a/agbenchmark/challenges/define_task_types.py
+++ b/agbenchmark/challenges/define_task_types.py
@@ -16,6 +16,7 @@ class Challenge(BaseModel):
     task: str
     ground: Ground
     difficulty: str
+    mock_func: Optional[str] = None
 
     def serialize(self, path: str) -> None:
         with open(path, "w") as file:

diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json
index b5d5701ea..c7cc31004 100644
--- a/agbenchmark/challenges/retrieval/r1/r1_data.json
+++ b/agbenchmark/challenges/retrieval/r1/r1_data.json
@@ -7,5 +7,6 @@
     "should_not_contain": ["New York", "Los Angeles", "San Francisco"],
     "files": ["file_to_check.txt"]
   },
-  "difficulty": "easy"
+  "difficulty": "easy",
+  "mock_func": "retrieval_1_mock"
 }

diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py
index 195de15f8..e20c9f7b9 100644
--- a/agbenchmark/challenges/retrieval/r1/r1_test.py
+++ b/agbenchmark/challenges/retrieval/r1/r1_test.py
@@ -11,7 +11,7 @@ class TestRetrieval1(RetrievalChallenge):
 
     @pytest.mark.parametrize(
         "server_response",
-        [data.task],
+        [(data.task, data.mock_func)],
         indirect=True,
     )
     @pytest.mark.retrieval

diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py
index 55f5ca82d..908d39e89 100644
--- a/agbenchmark/conftest.py
+++ b/agbenchmark/conftest.py
@@ -2,9 +2,10 @@ import json
 import os
 import pytest
 import shutil
-from agbenchmark.mocks.tests.retrieval_manual import mock_retrieval
 from agbenchmark.tests.regression.RegressionManager import RegressionManager
 import requests
+from requests.exceptions import RequestException
+from agbenchmark.mocks.MockManager import MockManager
 
 
 @pytest.fixture(scope="module")
@@ -33,15 +34,34 @@ def workspace(config):
 
 @pytest.fixture(autouse=True)
 def server_response(request, config):
-    task = request.param  # The task is passed in indirectly
-    print(f"Server starting at {request.module}")
-    # response = requests.post(
-    #     f"{config['hostname']}:{config['port']}", data={"task": task}
-    # )
-    # assert (
-    #     response.status_code == 200
-    # ), f"Request failed with status code {response.status_code}"
-    mock_retrieval(task, config["workspace"])
+    """Get a response for the task, delegating to the named mock if one is given."""
+    if isinstance(request.param, tuple):
+        task = request.param[0]  # The task is passed in indirectly
+        mock_function_name = request.param[1]
+    else:
+        task = request.param
+        mock_function_name = None
+    # print(f"Server starting at {request.module}")
+    # try:
+    #     response = requests.post(
+    #         f"{config['hostname']}:{config['port']}", data={"task": task}
+    #     )
+    #     response.raise_for_status()  # This will raise an HTTPError if the status is 4xx or 5xx
+    # except RequestException:
+    #     # If an exception occurs (could be connection, timeout, or HTTP errors), we use the mock
+
+    if mock_function_name:
+        mock_manager = MockManager(
+            task
+        )  # workspace doesn't need to be passed in, stays the same
+        print("Server unavailable, using mock", mock_function_name)
+        mock_manager.delegate(mock_function_name)
+    else:
+        print("No mock provided")
+
+    # else:
+    #     # This code is run if no exception occurred
+    #     print(f"Request succeeded with status code {response.status_code}")

diff --git a/agbenchmark/mocks/MockManager.py b/agbenchmark/mocks/MockManager.py
new file mode 100644
index 000000000..f4e7f5f5a
--- /dev/null
+++ b/agbenchmark/mocks/MockManager.py
@@ -0,0 +1,28 @@
+import sys
+import agbenchmark.mocks.tests.basic_mocks as basic_mocks
+import agbenchmark.mocks.tests.retrieval_mocks as retrieval_mocks
+
+
+class MockManager:
+    def __init__(self, task: str):
+        self.task = task
+        self.workspace = "agbenchmark/mocks/workspace"
+        self.modules = [basic_mocks, retrieval_mocks]
+
+    def delegate(self, mock_function_name, *args, **kwargs):
+        if hasattr(self, mock_function_name):
+            # Check if the mock function is an attribute of this class
+            getattr(self, mock_function_name)(*args, **kwargs)
+        elif mock_function_name in globals():
+            # Check if the function is imported in the file
+            func = globals()[mock_function_name]
+            func(self.task, self.workspace, *args, **kwargs)
+        else:
+            # check if the function is in one of the imported mock modules
+            for module in self.modules:
+                if hasattr(module, mock_function_name):
+                    func = getattr(module, mock_function_name)
+                    func(self.task, self.workspace, *args, **kwargs)
+                    return
+            # nothing matched, so fail loudly instead of silently doing nothing
+            raise ValueError(f"No such mock: {mock_function_name}")

diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py
new file mode 100644
index 000000000..e69de29bb

diff --git a/agbenchmark/mocks/tests/retrieval_manual.py b/agbenchmark/mocks/tests/retrieval_manual.py
deleted file mode 100644
index ccb482132..000000000
--- a/agbenchmark/mocks/tests/retrieval_manual.py
+++ /dev/null
@@ -1,10 +0,0 @@
-from ..basic_gpt_agent import basic_gpt_agent
-from agbenchmark.Challenge import Challenge
-
-
-def mock_retrieval(task: str, workspace: str):
-    # Call the basic_gpt_agent to get a response.
-    response = basic_gpt_agent(task)
-
-    # Open the file in write mode.
-    Challenge.write_to_file(workspace, "file_to_check.txt", response)

diff --git a/agbenchmark/mocks/tests/retrieval_mocks.py b/agbenchmark/mocks/tests/retrieval_mocks.py
new file mode 100644
index 000000000..23f4bde17
--- /dev/null
+++ b/agbenchmark/mocks/tests/retrieval_mocks.py
@@ -0,0 +1,13 @@
+from ..basic_gpt_agent import basic_gpt_agent
+from agbenchmark.Challenge import Challenge
+
+
+# TODO: Make it so that you can specify for tests to only run if their prerequisites are met.
+# Prerequisites here would be writing to a file (basic_abilities test).
+# Should also check if prerequisites exists in regression file
+def retrieval_1_mock(task: str, workspace: str):
+    # Call the basic_gpt_agent to get a response.
+    response = basic_gpt_agent(task)
+
+    # Open the file in write mode.
+    Challenge.write_to_file(workspace, "file_to_check.txt", response)
-- cgit v1.2.3

From b6562f3420bd6a77d415d8d57d3a1c9a4f9ed354 Mon Sep 17 00:00:00 2001
From: Silen Naihin
Date: Fri, 23 Jun 2023 09:31:21 -0400
Subject: Update README.md

---
 README.md | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index b46562d2d..0a8d119af 100644
--- a/README.md
+++ b/README.md
@@ -4,13 +4,16 @@ A repo built for the purpose of benchmarking the performance of agents far and w
 
 ##### Diagrams: https://whimsical.com/agbenchmark-5n4hXBq1ZGzBwRsK4TVY7x
 
-## Contributing
+### To run the basic existing mock (June 21)
 
-- Make sure you have `poetry` installed - `pip install poetry`.
-- Then `poetry install` for dependencies
+1. clone the repo `auto-gpt-benchmarks`
+2. `pip install poetry`
+3. `poetry shell`
+4. `poetry install`
+5. `agbenchmark start`
+   Keep config the same and watch the logs :)
 
 - To add requirements `poetry add requirement`.
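+- Without the CLI wrapper, `poetry run pytest agbenchmark -m retrieval -vs` should mirror the pytest invocation that `agbenchmark start --category retrieval` builds internally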
-- To run in venv `poetry run python script.py` Feel free to create prs to merge with `main` at will (but also feel free to ask for review) - if you can't send msg in R&D chat for access. -- cgit v1.2.3 From a5073ab57790a84d146877e1b6512eecbfc12b09 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 24 Jun 2023 09:42:36 -0400 Subject: basic challenges, more ChallengeData structure --- agbenchmark/Challenge.py | 22 ++++++++++++++++ agbenchmark/challenges/define_task_types.py | 16 ++++++++---- agbenchmark/challenges/retrieval/Retrieval.py | 22 +--------------- agbenchmark/challenges/retrieval/r1/r1_data.json | 10 +++++--- agbenchmark/challenges/retrieval/r1/r1_test.py | 6 +++-- agbenchmark/mocks/tests/basic_mocks.py | 28 +++++++++++++++++++++ agbenchmark/mocks/tests/retrieval_mocks.py | 7 +----- .../basic_abilities/read_file/r_file_data.json | 15 +++++++++++ .../basic_abilities/read_file/read_file_test.py | 29 ++++++++++++++++++++++ .../tests/basic_abilities/read_file_test.py | 0 .../basic_abilities/write_file/w_file_data.json | 16 ++++++++++++ .../basic_abilities/write_file/write_file_test.py | 27 ++++++++++++++++++++ .../tests/basic_abilities/write_file_test.py | 0 pyproject.toml | 3 ++- 14 files changed, 163 insertions(+), 38 deletions(-) create mode 100644 agbenchmark/tests/basic_abilities/read_file/r_file_data.json create mode 100644 agbenchmark/tests/basic_abilities/read_file/read_file_test.py delete mode 100644 agbenchmark/tests/basic_abilities/read_file_test.py create mode 100644 agbenchmark/tests/basic_abilities/write_file/w_file_data.json create mode 100644 agbenchmark/tests/basic_abilities/write_file/write_file_test.py delete mode 100644 agbenchmark/tests/basic_abilities/write_file_test.py diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py index 20bf55853..9828a0e9e 100644 --- a/agbenchmark/Challenge.py +++ b/agbenchmark/Challenge.py @@ -1,5 +1,6 @@ import os from typing import Optional +from agbenchmark.challenges.define_task_types import Ground class Challenge: @@ -30,3 +31,24 @@ class Challenge: for filename in os.listdir(workspace) if os.path.isfile(os.path.join(workspace, filename)) ] + + def scoring(self, content: str, ground: Ground): + if ground.should_contain: + for should_contain_word in ground.should_contain: + if should_contain_word not in content: + return 0.0 + else: + print( + f"Word that should exist: {should_contain_word} exists in the content" + ) + + if ground.should_not_contain: + for should_not_contain_word in ground.should_not_contain: + if should_not_contain_word in content: + return 0.0 + else: + print( + f"Word that should not exist: {should_not_contain_word} does not exist in the content" + ) + + return 1.0 diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index f1a841b53..879a46af0 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -4,6 +4,12 @@ import json import os +class Info(BaseModel): + difficulty: str + description: str + side_effects: List[str] + + class Ground(BaseModel): answer: str should_contain: Optional[List[str]] @@ -11,20 +17,20 @@ class Ground(BaseModel): files: List[str] -class Challenge(BaseModel): - category: str +class ChallengeData(BaseModel): + category: List[str] task: str ground: Ground - difficulty: str mock_func: Optional[str] = None + info: Info def serialize(self, path: str) -> None: with open(path, "w") as file: file.write(self.json()) @staticmethod - def deserialize(path: str) -> "Challenge": + 
def deserialize(path: str) -> "Challenge":
+    def deserialize(path: str) -> "ChallengeData":
         print("Deserializing", path)
         with open(path, "r") as file:
             data = json.load(file)
-        return Challenge(**data)
+        return ChallengeData(**data)

diff --git a/agbenchmark/challenges/retrieval/Retrieval.py b/agbenchmark/challenges/retrieval/Retrieval.py
index 2db22ae4d..9434d69c3 100644
--- a/agbenchmark/challenges/retrieval/Retrieval.py
+++ b/agbenchmark/challenges/retrieval/Retrieval.py
@@ -1,27 +1,7 @@
 from agbenchmark.Challenge import Challenge
-from agbenchmark.challenges.define_task_types import Ground
 
 
 class RetrievalChallenge(Challenge):
     """Challenge for information-retrieval"""
 
-    def scoring(self, content: str, ground: Ground):
-        if ground.should_contain:
-            for should_contain_word in ground.should_contain:
-                if should_contain_word not in content:
-                    return 0.0
-                else:
-                    print(
-                        f"Word that should exist: {should_contain_word} exists in the content"
-                    )
-
-        if ground.should_not_contain:
-            for should_not_contain_word in ground.should_not_contain:
-                if should_not_contain_word in content:
-                    return 0.0
-                else:
-                    print(
-                        f"Word that should not exist: {should_not_contain_word} does not exist in the content"
-                    )
-
-        return 1.0
+    pass

diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json
index c7cc31004..08b74d1b7 100644
--- a/agbenchmark/challenges/retrieval/r1/r1_data.json
+++ b/agbenchmark/challenges/retrieval/r1/r1_data.json
@@ -1,5 +1,5 @@
 {
-  "category": "retrieval",
+  "category": ["basic"],
   "task": "What is the capital of America?",
   "ground": {
     "answer": "Washington",
@@ -7,6 +7,10 @@
     "should_not_contain": ["New York", "Los Angeles", "San Francisco"],
     "files": ["file_to_check.txt"]
   },
-  "difficulty": "easy",
-  "mock_func": "retrieval_1_mock"
+  "mock_func": "write_file_mock",
+  "info": {
+    "difficulty": "easy",
+    "description": "Tests the writing to file",
+    "side_effects": ["tests if there is in fact an LLM attached"]
+  }
 }

diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py
index e20c9f7b9..d37c5e795 100644
--- a/agbenchmark/challenges/retrieval/r1/r1_test.py
+++ b/agbenchmark/challenges/retrieval/r1/r1_test.py
@@ -1,9 +1,11 @@
 import pytest
 from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge
-from agbenchmark.challenges.define_task_types import Challenge, Ground
+from agbenchmark.challenges.define_task_types import ChallengeData, Ground
 import os
 
-data = Challenge.deserialize(os.path.join(os.path.dirname(__file__), "r1_data.json"))
+data = ChallengeData.deserialize(
+    os.path.join(os.path.dirname(__file__), "r1_data.json")
+)
 
 
 class TestRetrieval1(RetrievalChallenge):

diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py
index e69de29bb..eb7b96541 100644
--- a/agbenchmark/mocks/tests/basic_mocks.py
+++ b/agbenchmark/mocks/tests/basic_mocks.py
@@ -0,0 +1,28 @@
+from agbenchmark.Challenge import Challenge
+from ..basic_gpt_agent import basic_gpt_agent
+
+
+def basic_read_file_mock(task: str, workspace: str):
+    """
+    This mock reads a file and returns its content.
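+    (It also seeds file_to_check.txt first, then rewrites it with a
+    "random string: " prefix; see the calls below.)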
+    """
+
+    Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing")
+
+    file_contents = Challenge.open_file(workspace, "file_to_check.txt")
+
+    Challenge.write_to_file(
+        workspace, "file_to_check.txt", f"random string: {file_contents}"
+    )
+
+
+def basic_write_file_mock(task: str, workspace: str):
+    """
+    This mock writes to a file (creates one if it doesn't exist)
+    """
+
+    # Call the basic_gpt_agent to get a response.
+    response = basic_gpt_agent(task)
+
+    # Open the file in write mode.
+    Challenge.write_to_file(workspace, "file_to_check.txt", response)

diff --git a/agbenchmark/mocks/tests/retrieval_mocks.py b/agbenchmark/mocks/tests/retrieval_mocks.py
index 23f4bde17..2481de060 100644
--- a/agbenchmark/mocks/tests/retrieval_mocks.py
+++ b/agbenchmark/mocks/tests/retrieval_mocks.py
@@ -1,4 +1,3 @@
-from ..basic_gpt_agent import basic_gpt_agent
 from agbenchmark.Challenge import Challenge
 
 
@@ -6,8 +5,4 @@ from agbenchmark.Challenge import Challenge
 # Prerequisites here would be writing to a file (basic_abilities test).
 # Should also check if prerequisites exists in regression file
 def retrieval_1_mock(task: str, workspace: str):
-    # Call the basic_gpt_agent to get a response.
-    response = basic_gpt_agent(task)
-
-    # Open the file in write mode.
-    Challenge.write_to_file(workspace, "file_to_check.txt", response)
+    pass

diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json
new file mode 100644
index 000000000..55319ddfc
--- /dev/null
+++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json
@@ -0,0 +1,15 @@
+{
+  "category": ["basic"],
+  "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
+  "ground": {
+    "answer": "random string: this is how we're doing",
+    "should_contain": ["random string: this is how we're doing"],
+    "files": ["file_to_check.txt"]
+  },
+  "mock_func": "basic_read_file_mock",
+  "info": {
+    "description": "This reads the file quickly",
+    "difficulty": "basic",
+    "side_effects": [""]
+  }
+}

diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py
new file mode 100644
index 000000000..610ccdab6
--- /dev/null
+++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py
@@ -0,0 +1,29 @@
+import pytest
+from agbenchmark.challenges.define_task_types import ChallengeData
+from agbenchmark.Challenge import Challenge
+import os
+
+data = ChallengeData.deserialize(
+    os.path.join(os.path.dirname(__file__), "r_file_data.json")
+)
+
+
+class TestReadFile(Challenge):
+    """Testing if LLM can read a file"""
+
+    @pytest.mark.parametrize(
+        "server_response",
+        [(data.task, data.mock_func)],
+        indirect=True,
+    )
+    @pytest.mark.basic
+    def test_retrieval(
+        self, workspace
+    ):  # the fixture argument just pulls in the shared workspace
+        file = self.open_file(workspace, data.ground.files[0])
+
+        score = self.scoring(file, data.ground)
+
+        print("Your score is:", score)
+
+        assert score

diff --git a/agbenchmark/tests/basic_abilities/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file_test.py
deleted file mode 100644
index e69de29bb..000000000

diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json
new file mode 100644
index 000000000..4aaa1347d
--- /dev/null
+++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json
@@ -0,0 +1,16 @@
+{
+  "category": ["basic"],
+  "task": "What is the capital of America?",
+  "ground": {
+    "answer": "Washington",
+    "should_contain": ["Washington"],
+    "should_not_contain": ["New York", "Los Angeles", "San Francisco"],
+    "files": ["file_to_check.txt"]
+  },
+  "mock_func": "basic_write_file_mock",
+  "info": {
+    "difficulty": "easy",
+    "description": "Tests the writing to file",
+    "side_effects": ["tests if there is in fact an LLM attached"]
+  }
+}

diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py
new file mode 100644
index 000000000..ccb10fe70
--- /dev/null
+++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py
@@ -0,0 +1,27 @@
+import pytest
+from agbenchmark.challenges.define_task_types import ChallengeData
+from agbenchmark.Challenge import Challenge
+import os
+
+data = ChallengeData.deserialize(
+    os.path.join(os.path.dirname(__file__), "w_file_data.json")
+)
+
+
+class TestWriteFile(Challenge):
+    """Testing if LLM can write to a file"""
+
+    @pytest.mark.parametrize(
+        "server_response",
+        [(data.task, data.mock_func)],
+        indirect=True,
+    )
+    @pytest.mark.basic
+    def test_retrieval(self, workspace):
+        file = self.open_file(workspace, data.ground.files[0])
+
+        score = self.scoring(file, data.ground)
+
+        print("Your score is:", score)
+
+        assert score

diff --git a/agbenchmark/tests/basic_abilities/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file_test.py
deleted file mode 100644
index e69de29bb..000000000

diff --git a/pyproject.toml b/pyproject.toml
index 5498381a2..6f79e75ce 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,7 +28,8 @@ testpaths = [
 ]
 markers = [
     "retrieval",
-    "regression"
+    "regression",
+    "basic"
 ]
 
 [tool.poetry.scripts]
-- cgit v1.2.3

From 66c9e68b0430066d23e9acd66e5259ea5d5190d7 Mon Sep 17 00:00:00 2001
From: Silen Naihin
Date: Sat, 24 Jun 2023 12:15:53 -0400
Subject: file creation from within file before server :)

---
 agbenchmark/conftest.py | 2 +-
 agbenchmark/mocks/tests/basic_mocks.py | 2 +-
 agbenchmark/tests/basic_abilities/read_file/read_file_test.py | 8 ++++++++
 agbenchmark/tests/regression/regression_tests.txt | 2 ++
 4 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py
index 908d39e89..434f6dbde 100644
--- a/agbenchmark/conftest.py
+++ b/agbenchmark/conftest.py
@@ -17,7 +17,7 @@ def config():
     return config
 
 
-@pytest.fixture
+@pytest.fixture(scope="module")
 def workspace(config):
     yield config["workspace"]
     # teardown after test function completes

diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py
index eb7b96541..bbff6a9c7 100644
--- a/agbenchmark/mocks/tests/basic_mocks.py
+++ b/agbenchmark/mocks/tests/basic_mocks.py
@@ -7,7 +7,7 @@ def basic_read_file_mock(task: str, workspace: str):
     This mock reads a file and returns its content.
""" - Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing") + # Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing") file_contents = Challenge.open_file(workspace, "file_to_check.txt") diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 610ccdab6..35d1d80c5 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -8,6 +8,14 @@ data = ChallengeData.deserialize( ) +@pytest.fixture(scope="module", autouse=True) +def setup_module(workspace): + if data.ground.should_contain: + Challenge.write_to_file( + workspace, data.ground.files[0], "this is how we're doing" + ) + + class TestReadFile(Challenge): """Testing if LLM can read a file""" diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index e69de29bb..a5f8fbd1d 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -0,0 +1,2 @@ +agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0] -- cgit v1.2.3 From 4fa9f72083aa09bf1770f10a3254c4d0ef674a9a Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 24 Jun 2023 12:24:17 -0400 Subject: adding dependencies on other challenges --- agbenchmark/mocks/tests/basic_mocks.py | 2 -- .../tests/basic_abilities/read_file/read_file_test.py | 1 + .../tests/basic_abilities/write_file/write_file_test.py | 1 + agbenchmark/tests/regression/regression_tests.txt | 1 - poetry.lock | 15 ++++++++++++++- pyproject.toml | 1 + 6 files changed, 17 insertions(+), 4 deletions(-) diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index bbff6a9c7..550095b72 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -7,8 +7,6 @@ def basic_read_file_mock(task: str, workspace: str): This mock reads a file and returns its content. 
""" - # Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing") - file_contents = Challenge.open_file(workspace, "file_to_check.txt") Challenge.write_to_file( diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 35d1d80c5..ea794281e 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -25,6 +25,7 @@ class TestReadFile(Challenge): indirect=True, ) @pytest.mark.basic + @pytest.mark.dependency(depends=["write_file"]) def test_retrieval( self, workspace ): # create_file simply there for the function to depend on the fixture diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index ccb10fe70..b2c559c9e 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -17,6 +17,7 @@ class TestWriteFile(Challenge): indirect=True, ) @pytest.mark.basic + @pytest.mark.dependency(name="write_file") def test_retrieval(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index a5f8fbd1d..84e625af4 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,2 +1 @@ -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0] agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0] diff --git a/poetry.lock b/poetry.lock index 3f1059aaf..3bc37622e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -595,6 +595,19 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +[[package]] +name = "pytest-dependency" +version = "0.5.1" +description = "Manage dependencies of tests" +optional = false +python-versions = "*" +files = [ + {file = "pytest-dependency-0.5.1.tar.gz", hash = "sha256:c2a892906192663f85030a6ab91304e508e546cddfe557d692d61ec57a1d946b"}, +] + +[package.dependencies] +pytest = ">=3.6.0" + [[package]] name = "requests" version = "2.31.0" @@ -765,4 +778,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "a13e69f2bd9e511e1af92ed02b155a90dec38a9b8d983a711e1b67931b467d38" +content-hash = "4a1629eb643b5b68d47f6d1407942aa6d4a796c6d5a1b6a54bbc096b9d0efa2d" diff --git a/pyproject.toml b/pyproject.toml index 6f79e75ce..087ac8447 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ click = "^8.1.3" requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" +pytest-dependency = "^0.5.1" [build-system] -- cgit v1.2.3 From f895d54e02c92e262172d9a773f7e6a4870d435d Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 24 Jun 2023 14:42:35 -0400 Subject: more elegant marking & dependency solution --- README.md | 74 +++++++++++++++++++--- agbenchmark/challenges/README.md | 38 +++++------ agbenchmark/challenges/define_task_types.py | 1 + agbenchmark/challenges/retrieval/r1/r1_data.json | 1 + .../tests/basic_abilities/BasicChallenge.py | 7 ++ .../basic_abilities/read_file/r_file_data.json | 1 + 
 .../basic_abilities/read_file/read_file_test.py | 12 ++--
 .../basic_abilities/write_file/w_file_data.json | 1 +
 .../basic_abilities/write_file/write_file_test.py | 9 ++-
 agbenchmark/tests/regression/regression_tests.txt | 2 +
 poetry.lock | 17 ++++-
 pyproject.toml | 1 +
 12 files changed, 126 insertions(+), 38 deletions(-)
 create mode 100644 agbenchmark/tests/basic_abilities/BasicChallenge.py

diff --git a/README.md b/README.md
index 0a8d119af..0ad0cf345 100644
--- a/README.md
+++ b/README.md
@@ -51,15 +51,73 @@ Share your progress :)
 
 to create a test:
 
-```
-@pytest.mark.parametrize(
-"server_response",
-["VARIABLE"], # VARIABLE = the query/goal you provide to the model
-indirect=True,
+```python
+import pytest
+from agbenchmark.challenges.define_task_types import ChallengeData
+from ..CategoryChallenge import CategoryChallenge
+import os
+
+data = ChallengeData.deserialize(
+    os.path.join(os.path.dirname(__file__), "r_file_data.json")
 )
-@pytest.mark.(VARIABLE) # VARIABLE = category of the test
-def test_file_in_workspace(workspace): # VARIABLE = the actual test that asserts
-assert os.path.exists(os.path.join(workspace, "file_to_check.txt"))
+
+class TestSomething(CategoryChallenge):
+    """Template for a challenge test"""
+
+    @pytest.mark.parametrize(
+        "server_response",
+        [(data.task, data.mock_func)],
+        indirect=True,
+    )
+    def test_retrieval(
+        self, workspace
+    ):
+        # scoring logic goes here
+```
+
+All challenges inherit from a parent class which carries the category mark
+
+```python
+@pytest.mark.basic
+class BasicChallenge(Challenge):
+    pass
+```
+
+If you want to add a custom mark to a Challenge, you must specify it before the test definition
+
+```python
+@pytest.mark.other_mark
+def test_retrieval(self, workspace):
+```
+
+To add a dependency to a challenge use the following
+
+```python
+# to define what a test depends on
+from pytest_dependency import depends
+
+def test1(self, request, workspace):
+    depends(request, data.dependencies)
+# for defining a test as a dependency
+@pytest.mark.dependency()
+def test2(self, workspace):
+```
+
+Ordering of challenges needs to be used in combination with the above so that a test executes after the tests it depends on
+
+```python
+@pytest.mark.run(order=1)
+```
+
+To create a file to test a challenge, add this to the challenge file which will create a file before running the server
+
+```python
+@pytest.fixture(scope="module", autouse=True)
+def setup_module(workspace):
+    if data.ground.should_contain:
+        Challenge.write_to_file(
+            workspace, data.ground.files[0], "this is how we're doing"
+        )
+```
 
 ## Api

diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md
index 50efe2c4d..d5229e937 100644
--- a/agbenchmark/challenges/README.md
+++ b/agbenchmark/challenges/README.md
@@ -4,28 +4,25 @@ Input:
 
-- **category** (str): information-retrieval
-- **difficulty**(str): the difficulty of this query. choices from
-
-## Information-retrieval challenges
-
-Input:
-
-- **category** (str): information-retrieval
-- **task** (str): the question the agent needs to be solve.
+- **category** (str[]): Category of the challenge such as 'retrieval', 'comprehension', etc. _this is not currently used. for the future it may be needed_
+- **task** (str): The task that the agent needs to solve.
+- **dependencies** (str[]): The dependencies that the challenge needs to run.
 - **ground** (dict): The ground truth.
- - **answer** (str): The raw text of ground truth answer - - **should_contain** (list): the exact strings that is required in the final answer - - **should_not_contain** (list): the exact strings that should not be in the final answer - - **files**: files that the are used for retrieval. Can specify file here or an extension **TODO:** like .txt -- **difficulty**(str): the difficulty of this query. choices from -- **mock_func**: function to mock the agent's response. This is used for testing purposes + - **answer** (str): The raw text of the ground truth answer. + - **should_contain** (list): The exact strings that are required in the final answer. + - **should_not_contain** (list): The exact strings that should not be in the final answer. + - **files** (list): Files that are used for retrieval. Can specify file here or an extension. +- **mock_func** (str): Function to mock the agent's response. This is used for testing purposes. +- **info** (dict): Additional info about the challenge. + - **difficulty** (str): The difficulty of this query. + - **description** (str): Description of the challenge. + - **side_effects** (str[]): Describes the effects of the challenge. Example: ```python { - "category": "retrieval", + "category": ["basic"], "task": "What is the capital of America?", "ground": { "answer": "Washington", @@ -33,11 +30,16 @@ Example: "should_not_contain": ["New York", "Los Angeles", "San Francisco"], "files": ["file_to_check.txt"] }, - "difficulty": "easy" + "mock_func": "write_file_mock", + "info": { + "difficulty": "easy", + "description": "Tests the writing to file", + "side_effects": ["tests if there is in fact an LLM attached"] + } } ``` -Output: +Current Output: - **score** (float): scores range from [0, 1] diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index 879a46af0..694671218 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -20,6 +20,7 @@ class Ground(BaseModel): class ChallengeData(BaseModel): category: List[str] task: str + dependencies: List[str] ground: Ground mock_func: Optional[str] = None info: Info diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index 08b74d1b7..fe05b6d51 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -1,5 +1,6 @@ { "category": ["basic"], + "dependencies": ["test_write_file"], "task": "What is the capital of America?", "ground": { "answer": "Washington", diff --git a/agbenchmark/tests/basic_abilities/BasicChallenge.py b/agbenchmark/tests/basic_abilities/BasicChallenge.py new file mode 100644 index 000000000..563207405 --- /dev/null +++ b/agbenchmark/tests/basic_abilities/BasicChallenge.py @@ -0,0 +1,7 @@ +import pytest +from agbenchmark.Challenge import Challenge + + +@pytest.mark.basic +class BasicChallenge(Challenge): + pass diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index 55319ddfc..8c5ef62db 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -1,6 +1,7 @@ { "category": ["basic"], "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "dependencies": ["test_write_file"], "ground": { "answer": "random string: this is how we're doing", "should_contain": 
["random string: this is how we're doing"], diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index ea794281e..03b2d6cab 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -1,7 +1,9 @@ import pytest from agbenchmark.challenges.define_task_types import ChallengeData from agbenchmark.Challenge import Challenge +from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os +from pytest_dependency import depends data = ChallengeData.deserialize( os.path.join(os.path.dirname(__file__), "r_file_data.json") @@ -16,7 +18,7 @@ def setup_module(workspace): ) -class TestReadFile(Challenge): +class TestReadFile(BasicChallenge): """Testing if LLM can read a file""" @pytest.mark.parametrize( @@ -24,11 +26,9 @@ class TestReadFile(Challenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.basic - @pytest.mark.dependency(depends=["write_file"]) - def test_retrieval( - self, workspace - ): # create_file simply there for the function to depend on the fixture + def test_read_file(self, request, workspace): + depends(request, data.dependencies) + file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json index 4aaa1347d..562d1c364 100644 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -1,6 +1,7 @@ { "category": ["basic"], "task": "What is the capital of America?", + "dependencies": [], "ground": { "answer": "Washington", "should_contain": ["Washington"], diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index b2c559c9e..b09162e3d 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -1,6 +1,6 @@ import pytest from agbenchmark.challenges.define_task_types import ChallengeData -from agbenchmark.Challenge import Challenge +from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os data = ChallengeData.deserialize( @@ -8,7 +8,7 @@ data = ChallengeData.deserialize( ) -class TestWriteFile(Challenge): +class TestWriteFile(BasicChallenge): """Testing if LLM can write to a file""" @pytest.mark.parametrize( @@ -16,9 +16,8 @@ class TestWriteFile(Challenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.basic - @pytest.mark.dependency(name="write_file") - def test_retrieval(self, workspace): + @pytest.mark.dependency() + def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index 84e625af4..b831003fc 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1 +1,3 @@ agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0] 
+agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] diff --git a/poetry.lock b/poetry.lock index 3bc37622e..f6f24c5f2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -608,6 +608,21 @@ files = [ [package.dependencies] pytest = ">=3.6.0" +[[package]] +name = "pytest-ordering" +version = "0.6" +description = "pytest plugin to run your tests in a specific order" +optional = false +python-versions = "*" +files = [ + {file = "pytest-ordering-0.6.tar.gz", hash = "sha256:561ad653626bb171da78e682f6d39ac33bb13b3e272d406cd555adb6b006bda6"}, + {file = "pytest_ordering-0.6-py2-none-any.whl", hash = "sha256:27fba3fc265f5d0f8597e7557885662c1bdc1969497cd58aff6ed21c3b617de2"}, + {file = "pytest_ordering-0.6-py3-none-any.whl", hash = "sha256:3f314a178dbeb6777509548727dc69edf22d6d9a2867bf2d310ab85c403380b6"}, +] + +[package.dependencies] +pytest = "*" + [[package]] name = "requests" version = "2.31.0" @@ -778,4 +793,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "4a1629eb643b5b68d47f6d1407942aa6d4a796c6d5a1b6a54bbc096b9d0efa2d" +content-hash = "65b68e43440faafbd2883edd6b10bc177ab334380e908c27c9f511703065f8e7" diff --git a/pyproject.toml b/pyproject.toml index 087ac8447..faee61c2d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" pytest-dependency = "^0.5.1" +pytest-ordering = "^0.6" [build-system] -- cgit v1.2.3 From d1c5e0a91a7a0f23b0e8de5f394204e96ec668cd Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 25 Jun 2023 00:22:53 -0400 Subject: finally figured out right way to do dependencies --- agbenchmark/challenges/retrieval/Retrieval.py | 2 ++ agbenchmark/challenges/retrieval/r1/r1_data.json | 4 ++-- agbenchmark/challenges/retrieval/r1/r1_test.py | 6 ++++-- agbenchmark/tests/basic_abilities/BasicChallenge.py | 1 + .../tests/basic_abilities/read_file/r_file_data.json | 4 +++- .../tests/basic_abilities/read_file/read_file_test.py | 6 ++---- .../basic_abilities/write_file/write_file_test.py | 1 - agbenchmark/tests/regression/regression_tests.txt | 4 ++-- poetry.lock | 19 ++++++++++++++++++- pyproject.toml | 3 ++- 10 files changed, 36 insertions(+), 14 deletions(-) diff --git a/agbenchmark/challenges/retrieval/Retrieval.py b/agbenchmark/challenges/retrieval/Retrieval.py index 9434d69c3..b8aa81ce3 100644 --- a/agbenchmark/challenges/retrieval/Retrieval.py +++ b/agbenchmark/challenges/retrieval/Retrieval.py @@ -1,6 +1,8 @@ from agbenchmark.Challenge import Challenge +import pytest +@pytest.mark.retrieval class RetrievalChallenge(Challenge): """Challenge for information-retrieval""" diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index fe05b6d51..562d1c364 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -1,14 +1,14 @@ { "category": ["basic"], - "dependencies": ["test_write_file"], "task": "What is the capital of America?", + "dependencies": [], "ground": { "answer": "Washington", "should_contain": ["Washington"], "should_not_contain": ["New York", "Los Angeles", "San Francisco"], "files": ["file_to_check.txt"] }, - "mock_func": "write_file_mock", + "mock_func": "basic_write_file_mock", "info": { "difficulty": "easy", "description": "Tests the writing to file", diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 
d37c5e795..5e6d6abf4 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -2,6 +2,8 @@ import pytest from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge from agbenchmark.challenges.define_task_types import ChallengeData, Ground import os +from pytest_dependency import depends + data = ChallengeData.deserialize( os.path.join(os.path.dirname(__file__), "r1_data.json") @@ -16,8 +18,8 @@ class TestRetrieval1(RetrievalChallenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.retrieval - def test_retrieval(self, workspace): + def test_retrieval(self, request, workspace): + depends(request, data.dependencies) file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/basic_abilities/BasicChallenge.py b/agbenchmark/tests/basic_abilities/BasicChallenge.py index 563207405..0cada86cc 100644 --- a/agbenchmark/tests/basic_abilities/BasicChallenge.py +++ b/agbenchmark/tests/basic_abilities/BasicChallenge.py @@ -2,6 +2,7 @@ import pytest from agbenchmark.Challenge import Challenge +@pytest.mark.run(order=1) @pytest.mark.basic class BasicChallenge(Challenge): pass diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index 8c5ef62db..4d04f33e7 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -1,7 +1,9 @@ { "category": ["basic"], "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", - "dependencies": ["test_write_file"], + "dependencies": [ + "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file" + ], "ground": { "answer": "random string: this is how we're doing", "should_contain": ["random string: this is how we're doing"], diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 03b2d6cab..ad08da4e0 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -3,7 +3,6 @@ from agbenchmark.challenges.define_task_types import ChallengeData from agbenchmark.Challenge import Challenge from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os -from pytest_dependency import depends data = ChallengeData.deserialize( os.path.join(os.path.dirname(__file__), "r_file_data.json") @@ -26,9 +25,8 @@ class TestReadFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) - def test_read_file(self, request, workspace): - depends(request, data.dependencies) - + @pytest.mark.order(after=data.dependencies) + def test_read_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index b09162e3d..4c94320e0 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -16,7 +16,6 @@ class TestWriteFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.dependency() def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git 
a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index b831003fc..df27f3124 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,3 +1,3 @@ -agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0] -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0] agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] +agbenchmark/challenges/retrieval/r1/r1_test.py::TestRetrieval1::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[server_response0] diff --git a/poetry.lock b/poetry.lock index f6f24c5f2..4764bf493 100644 --- a/poetry.lock +++ b/poetry.lock @@ -608,6 +608,23 @@ files = [ [package.dependencies] pytest = ">=3.6.0" +[[package]] +name = "pytest-order" +version = "1.1.0" +description = "pytest plugin to run your tests in a specific order" +optional = false +python-versions = ">=3.6" +files = [ + {file = "pytest-order-1.1.0.tar.gz", hash = "sha256:139d25b30826b78eebb42722f747eab14c44b88059d7a71d4f79d14a057269a5"}, + {file = "pytest_order-1.1.0-py3-none-any.whl", hash = "sha256:3b3730969c97900fa5cd31ecff80847680ed56b2490954565c14949ba60d9371"}, +] + +[package.dependencies] +pytest = [ + {version = ">=5.0", markers = "python_version < \"3.10\""}, + {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, +] + [[package]] name = "pytest-ordering" version = "0.6" @@ -793,4 +810,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "65b68e43440faafbd2883edd6b10bc177ab334380e908c27c9f511703065f8e7" +content-hash = "64d22c864fe244497b7ebc81ead1be0b0570b14ee1ced323813d427672e17ff3" diff --git a/pyproject.toml b/pyproject.toml index faee61c2d..fd2c52041 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ openai = "^0.27.8" pydantic = "^1.10.9" pytest-dependency = "^0.5.1" pytest-ordering = "^0.6" +pytest-order = "^1.1.0" [build-system] @@ -24,7 +25,7 @@ build-backend = "poetry.core.masonry.api" [tool.pytest.ini_options] minversion = "6.0" -addopts = "-ra -q" +addopts = "--order-dependencies" # -ra -q testpaths = [ "tests", "agbenchmark", ] -- cgit v1.2.3 From 31c11927199714516891db5aa3044eb1a4396eb4 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 25 Jun 2023 08:48:16 -0400 Subject: other was non solution, solution is pytest-depends --- agbenchmark/challenges/README.md | 20 +++--- agbenchmark/challenges/retrieval/r1/r1_test.py | 2 - .../tests/basic_abilities/BasicChallenge.py | 1 - .../basic_abilities/read_file/r_file_data.json | 4 +- .../basic_abilities/read_file/read_file_test.py | 2 +- .../basic_abilities/write_file/write_file_test.py | 1 + agbenchmark/tests/regression/regression_tests.txt | 2 +- poetry.lock | 80 ++++++++++++---------- pyproject.toml | 6 +- 9 files changed, 59 insertions(+), 59 deletions(-) diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md index d5229e937..e457b85c4 100644 --- a/agbenchmark/challenges/README.md +++ b/agbenchmark/challenges/README.md @@ -6,7 +6,7 @@ Input: - **category** (str[]): Category of the challenge such as 'retrieval', 'comprehension', etc. _this is not currently used. for the future it may be needed_ - **task** (str): The task that the agent needs to solve. 
-- **dependencies** (str[]): The dependencies that the challenge needs to run.
+- **dependencies** (str[]): The dependencies that the challenge needs to run. This needs to be the full pytest node ID of the test function.
 - **ground** (dict): The ground truth.
-  - **answer** (str): The raw text of the ground truth answer.
-  - **should_contain** (list): The exact strings that are required in the final answer.
-  - **should_not_contain** (list): The exact strings that should not be in the final answer.
-  - **files** (list): Files that are used for retrieval. Can specify file here or an extension.
+  - **answer** (str): The raw text of the ground truth answer.
+  - **should_contain** (list): The exact strings that are required in the final answer.
+  - **should_not_contain** (list): The exact strings that should not be in the final answer.
+  - **files** (list): Files that are used for retrieval. Can specify file here or an extension.
 - **mock_func** (str): Function to mock the agent's response. This is used for testing purposes.
 - **info** (dict): Additional info about the challenge.
   - **difficulty** (str): The difficulty of this query.
   - **description** (str): Description of the challenge.
   - **side_effects** (str[]): Describes the effects of the challenge.
 
 Example:
 
 ```python
 {
     "category": ["basic"],
-    "task": "What is the capital of America?",
+    "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
+    "dependencies": [
+        "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file"
+    ],
     "ground": {
-        "answer": "Washington",
-        "should_contain": ["Washington"],
-        "should_not_contain": ["New York", "Los Angeles", "San Francisco"],
+        "answer": "random string: this is how we're doing",
+        "should_contain": ["random string: this is how we're doing"],
         "files": ["file_to_check.txt"]
     },
-    "mock_func": "write_file_mock",
+    "mock_func": "basic_read_file_mock",
     "info": {
-        "difficulty": "easy",
-        "description": "Tests the writing to file",
-        "side_effects": ["tests if there is in fact an LLM attached"]
+        "description": "This reads the file quickly",
+        "difficulty": "basic",
+        "side_effects": [""]
     }
 }

diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py
index 5e6d6abf4..45becaf75 100644
--- a/agbenchmark/challenges/retrieval/r1/r1_test.py
+++ b/agbenchmark/challenges/retrieval/r1/r1_test.py
@@ -2,7 +2,6 @@ import pytest
 from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge
 from agbenchmark.challenges.define_task_types import ChallengeData, Ground
 import os
-from pytest_dependency import depends
 
 
 data = ChallengeData.deserialize(
@@ -19,7 +18,6 @@ class TestRetrieval1(RetrievalChallenge):
         indirect=True,
     )
     def test_retrieval(self, request, workspace):
-        depends(request, data.dependencies)
         file = self.open_file(workspace, data.ground.files[0])
 
         score = self.scoring(file, data.ground)

diff --git a/agbenchmark/tests/basic_abilities/BasicChallenge.py b/agbenchmark/tests/basic_abilities/BasicChallenge.py
index 0cada86cc..563207405 100644
--- a/agbenchmark/tests/basic_abilities/BasicChallenge.py
+++ b/agbenchmark/tests/basic_abilities/BasicChallenge.py
@@ -2,7 +2,6 @@ import pytest
 from agbenchmark.Challenge import Challenge
 
 
-@pytest.mark.run(order=1)
 @pytest.mark.basic
 class BasicChallenge(Challenge):
     pass

diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json
index 4d04f33e7..8c5ef62db 100644
--- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json
+++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json
@@ -1,9 +1,7 @@
 {
   "category": ["basic"],
   "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
-  "dependencies": [
-    "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file"
-  ],
+  "dependencies": ["test_write_file"],
   "ground": {
     "answer": "random string: this is how we're doing",
     "should_contain": ["random string: this is how we're doing"],

diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py
index 03b2d6cab..494a9b071 100644
--- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py
+++
b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -25,7 +25,7 @@ class TestReadFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.order(after=data.dependencies) + @pytest.mark.depends(on=data.dependencies) def test_read_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 4c94320e0..0a4ef4a2c 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -16,6 +16,7 @@ class TestWriteFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) + @pytest.mark.depends(name="test_write_file") def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index df27f3124..57b94cd7a 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,3 +1,3 @@ -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] agbenchmark/challenges/retrieval/r1/r1_test.py::TestRetrieval1::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[server_response0] diff --git a/poetry.lock b/poetry.lock index 4764bf493..d7939fbfe 100644 --- a/poetry.lock +++ b/poetry.lock @@ -368,6 +368,20 @@ files = [ {file = "frozenlist-1.3.3.tar.gz", hash = "sha256:58bcc55721e8a90b88332d6cd441261ebb22342e238296bb330968952fbb3a6a"}, ] +[[package]] +name = "future-fstrings" +version = "1.2.0" +description = "A backport of fstrings to python<3.6" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "future_fstrings-1.2.0-py2.py3-none-any.whl", hash = "sha256:90e49598b553d8746c4dc7d9442e0359d038c3039d802c91c0a55505da318c63"}, + {file = "future_fstrings-1.2.0.tar.gz", hash = "sha256:6cf41cbe97c398ab5a81168ce0dbb8ad95862d3caf23c21e4430627b90844089"}, +] + +[package.extras] +rewrite = ["tokenize-rt (>=3)"] + [[package]] name = "idna" version = "3.4" @@ -473,6 +487,24 @@ files = [ {file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"}, ] +[[package]] +name = "networkx" +version = "3.1" +description = "Python package for creating and manipulating graphs and networks" +optional = false +python-versions = ">=3.8" +files = [ + {file = "networkx-3.1-py3-none-any.whl", hash = "sha256:4f33f68cb2afcf86f28a45f43efc27a9386b535d567d2127f8f61d51dec58d36"}, + {file = "networkx-3.1.tar.gz", hash = "sha256:de346335408f84de0eada6ff9fafafff9bcda11f0a0dfaa931133debb146ab61"}, +] + +[package.extras] +default = ["matplotlib (>=3.4)", "numpy (>=1.20)", "pandas (>=1.3)", "scipy (>=1.8)"] +developer = ["mypy (>=1.1)", "pre-commit (>=3.2)"] +doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx-theme (>=0.13)", "sphinx (>=6.1)", "sphinx-gallery (>=0.12)", "texext (>=0.6.7)"] +extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"] +test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] + [[package]] name = "openai" version = 
"0.27.8" @@ -596,49 +628,21 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] [[package]] -name = "pytest-dependency" -version = "0.5.1" -description = "Manage dependencies of tests" -optional = false -python-versions = "*" -files = [ - {file = "pytest-dependency-0.5.1.tar.gz", hash = "sha256:c2a892906192663f85030a6ab91304e508e546cddfe557d692d61ec57a1d946b"}, -] - -[package.dependencies] -pytest = ">=3.6.0" - -[[package]] -name = "pytest-order" -version = "1.1.0" -description = "pytest plugin to run your tests in a specific order" -optional = false -python-versions = ">=3.6" -files = [ - {file = "pytest-order-1.1.0.tar.gz", hash = "sha256:139d25b30826b78eebb42722f747eab14c44b88059d7a71d4f79d14a057269a5"}, - {file = "pytest_order-1.1.0-py3-none-any.whl", hash = "sha256:3b3730969c97900fa5cd31ecff80847680ed56b2490954565c14949ba60d9371"}, -] - -[package.dependencies] -pytest = [ - {version = ">=5.0", markers = "python_version < \"3.10\""}, - {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, -] - -[[package]] -name = "pytest-ordering" -version = "0.6" -description = "pytest plugin to run your tests in a specific order" +name = "pytest-depends" +version = "1.0.1" +description = "Tests that depend on other tests" optional = false python-versions = "*" files = [ - {file = "pytest-ordering-0.6.tar.gz", hash = "sha256:561ad653626bb171da78e682f6d39ac33bb13b3e272d406cd555adb6b006bda6"}, - {file = "pytest_ordering-0.6-py2-none-any.whl", hash = "sha256:27fba3fc265f5d0f8597e7557885662c1bdc1969497cd58aff6ed21c3b617de2"}, - {file = "pytest_ordering-0.6-py3-none-any.whl", hash = "sha256:3f314a178dbeb6777509548727dc69edf22d6d9a2867bf2d310ab85c403380b6"}, + {file = "pytest-depends-1.0.1.tar.gz", hash = "sha256:90a28e2b87b75b18abd128c94015248544acac20e4392e9921e5a86f93319dfe"}, + {file = "pytest_depends-1.0.1-py3-none-any.whl", hash = "sha256:a1df072bcc93d77aca3f0946903f5fed8af2d9b0056db1dfc9ed5ac164ab0642"}, ] [package.dependencies] -pytest = "*" +colorama = "*" +future-fstrings = "*" +networkx = "*" +pytest = ">=3" [[package]] name = "requests" @@ -810,4 +814,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "64d22c864fe244497b7ebc81ead1be0b0570b14ee1ced323813d427672e17ff3" +content-hash = "a03dfa9938e062bdf564b7678df9dc9277c7c8e504f14f98084c5a2d497a8f7c" diff --git a/pyproject.toml b/pyproject.toml index fd2c52041..0a4f8ba73 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,9 +14,7 @@ click = "^8.1.3" requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" -pytest-dependency = "^0.5.1" -pytest-ordering = "^0.6" -pytest-order = "^1.1.0" +pytest-depends = "^1.0.1" [build-system] @@ -25,7 +23,7 @@ build-backend = "poetry.core.masonry.api" [tool.pytest.ini_options] minversion = "6.0" -addopts = "--order-dependencies" # -ra -q +addopts = "-ra -q" testpaths = [ "tests", "agbenchmark", ] -- cgit v1.2.3 From adc6b225a6063bc2b0981f1156f25bde9279040e Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 25 Jun 2023 11:12:33 -0400 Subject: update regression tests info --- agbenchmark/challenges/retrieval/r1/r1_test.py | 7 ++++- agbenchmark/conftest.py | 36 +++++++++++++++------- .../basic_abilities/read_file/read_file_test.py | 5 +++ .../basic_abilities/write_file/w_file_data.json | 2 +- .../basic_abilities/write_file/write_file_test.py | 5 +++ 
agbenchmark/tests/regression/RegressionManager.py | 25 +++++++++------ agbenchmark/tests/regression/regression_tests.json | 1 + agbenchmark/tests/regression/regression_tests.txt | 17 ++++++++-- 8 files changed, 73 insertions(+), 25 deletions(-) create mode 100644 agbenchmark/tests/regression/regression_tests.json diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 45becaf75..489d298fb 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -17,7 +17,12 @@ class TestRetrieval1(RetrievalChallenge): [(data.task, data.mock_func)], indirect=True, ) - def test_retrieval(self, request, workspace): + @pytest.mark.parametrize( + "regression_data", + [data], + indirect=True, + ) + def test_retrieval(self, workspace, current_challenge_data): file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 434f6dbde..78114c204 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -6,6 +6,7 @@ from agbenchmark.tests.regression.RegressionManager import RegressionManager import requests from requests.exceptions import RequestException from agbenchmark.mocks.MockManager import MockManager +from agbenchmark.challenges.define_task_types import ChallengeData @pytest.fixture(scope="module") @@ -64,21 +65,34 @@ def server_response(request, config): # print(f"Request succeeded with status code {response.status_code}") -regression_txt = "agbenchmark/tests/regression/regression_tests.txt" +regression_json = "agbenchmark/tests/regression/regression_tests.json" -regression_manager = RegressionManager(regression_txt) +regression_manager = RegressionManager(regression_json) + + +# this is to get the challenge_data from every test +@pytest.fixture(autouse=True) +def regression_data(request): + return request.param def pytest_runtest_makereport(item, call): - """Called for each test report. 
Generated for each stage - of a test run (setup, call, teardown).""" if call.when == "call": - if ( - call.excinfo is None - ): # if no error in the call stage, add it as a regression test - regression_manager.add_test(item.nodeid) - else: # otherwise, :( - regression_manager.remove_test(item.nodeid) + challenge_data = item.funcargs.get("regression_data", None) + difficulty = challenge_data.info.difficulty if challenge_data else "unknown" + dependencies = challenge_data.dependencies if challenge_data else [] + + test_details = { + "difficulty": difficulty, + "dependencies": dependencies, + "test": item.nodeid, + } + + print("pytest_runtest_makereport", test_details) + if call.excinfo is None: + regression_manager.add_test(item.nodeid.split("::")[1], test_details) + else: + regression_manager.remove_test(item.nodeid.split("::")[1]) def pytest_collection_modifyitems(items): @@ -86,7 +100,7 @@ def pytest_collection_modifyitems(items): to add regression marker to collected test items.""" for item in items: print("pytest_collection_modifyitems", item.nodeid) - if item.nodeid + "\n" in regression_manager.tests: + if item.nodeid.split("::")[1] in regression_manager.tests: print(regression_manager.tests) item.add_marker(pytest.mark.regression) diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 494a9b071..7d14228c8 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -25,6 +25,11 @@ class TestReadFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) + @pytest.mark.parametrize( + "regression_data", + [data], + indirect=True, + ) @pytest.mark.depends(on=data.dependencies) def test_read_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json index 562d1c364..1d2621081 100644 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -10,7 +10,7 @@ }, "mock_func": "basic_write_file_mock", "info": { - "difficulty": "easy", + "difficulty": "basic", "description": "Tests the writing to file", "side_effects": ["tests if there is in fact an LLM attached"] } diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 0a4ef4a2c..330128898 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -16,6 +16,11 @@ class TestWriteFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) + @pytest.mark.parametrize( + "regression_data", + [data], + indirect=True, + ) @pytest.mark.depends(name="test_write_file") def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/regression/RegressionManager.py b/agbenchmark/tests/regression/RegressionManager.py index 9117d53f1..a1379ecae 100644 --- a/agbenchmark/tests/regression/RegressionManager.py +++ b/agbenchmark/tests/regression/RegressionManager.py @@ -1,3 +1,6 @@ +import json + + class RegressionManager: """Abstracts interaction with the regression tests file""" @@ -6,17 +9,21 @@ class RegressionManager: self.load() def load(self) -> None: - with open(self.filename, "r") as f: - 
self.tests = f.readlines() + try: + with open(self.filename, "r") as f: + self.tests = json.load(f) + except (FileNotFoundError, json.decoder.JSONDecodeError): + self.tests = {} def save(self) -> None: with open(self.filename, "w") as f: - f.writelines(self.tests) + json.dump(self.tests, f, indent=4) - def add_test(self, test_id) -> None: - if f"{test_id}\n" not in self.tests: - self.tests.append(f"{test_id}\n") + def add_test(self, test_name: str, test_details: dict) -> None: + self.tests[test_name] = test_details + self.save() - def remove_test(self, test_id) -> None: - if f"{test_id}\n" in self.tests: - self.tests.remove(f"{test_id}\n") + def remove_test(self, test_name: str) -> None: + if test_name in self.tests: + del self.tests[test_name] + self.save() diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json new file mode 100644 index 000000000..9e26dfeeb --- /dev/null +++ b/agbenchmark/tests/regression/regression_tests.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index 57b94cd7a..8af722f07 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,3 +1,14 @@ -agbenchmark/challenges/retrieval/r1/r1_test.py::TestRetrieval1::test_retrieval[server_response0] -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] -agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[server_response0] +{ + "agbenchmark/tests/basic_abilities/write_file/write_file_test.py": { + "difficulty": "easy", + "dependencies": [], + "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[regression_data0-server_response0]" + }, + "agbenchmark/tests/basic_abilities/read_file/read_file_test.py": { + "difficulty": "basic", + "dependencies": [ + "test_write_file" + ], + "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[regression_data0-server_response0]" + } +} \ No newline at end of file -- cgit v1.2.3 From 7604ae07bb6d79cfe8e5a28fdf3fa85c83603b1b Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 25 Jun 2023 19:30:04 -0400 Subject: can now put file extensions or names in files data --- agbenchmark/Challenge.py | 22 +++++++++++++++++++++- agbenchmark/challenges/retrieval/r1/r1_test.py | 12 +++++++----- .../basic_abilities/read_file/read_file_test.py | 12 +++++++----- .../basic_abilities/write_file/w_file_data.json | 2 +- .../basic_abilities/write_file/write_file_test.py | 12 +++++++----- agbenchmark/tests/regression/regression_tests.json | 15 ++++++++++++++- 6 files changed, 57 insertions(+), 18 deletions(-) diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py index 9828a0e9e..d159296b1 100644 --- a/agbenchmark/Challenge.py +++ b/agbenchmark/Challenge.py @@ -1,5 +1,5 @@ import os -from typing import Optional +import glob from agbenchmark.challenges.define_task_types import Ground @@ -14,6 +14,26 @@ class Challenge: with open(workspace_dir, "r") as f: return f.read() + @staticmethod + def open_files(workspace: str, file_patterns: list): + script_dir = os.path.abspath(workspace) + files_contents = [] + + for file_pattern in file_patterns: + # Check if it is a file extension + if file_pattern.startswith("."): + # Find all files with the given extension in the workspace 
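# For instance, a ground "files" entry of ".txt" expands here to
# glob.glob(os.path.join(script_dir, "*.txt")) and would match
# file_to_check.txt in the workspace, while a concrete name like
# "file_to_check.txt" falls through to the else branch as a single path.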
+ matching_files = glob.glob(os.path.join(script_dir, "*" + file_pattern)) + else: + # Otherwise, it is a specific file + matching_files = [os.path.join(script_dir, file_pattern)] + + for file_path in matching_files: + with open(file_path, "r") as f: + files_contents.append(f.read()) + + return files_contents + @staticmethod def write_to_file(workspace: str, filename: str, content: str): script_dir = os.path.abspath(workspace) diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 489d298fb..2a7d92a71 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -23,10 +23,12 @@ class TestRetrieval1(RetrievalChallenge): indirect=True, ) def test_retrieval(self, workspace, current_challenge_data): - file = self.open_file(workspace, data.ground.files[0]) + files_contents = self.open_files(workspace, data.ground.files) - score = self.scoring(file, data.ground) + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, data.ground) + print("Your score is:", score) + scores.append(score) - print("You score is:", score) - - assert score + assert 1 in scores diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 7d14228c8..90946670c 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -32,10 +32,12 @@ class TestReadFile(BasicChallenge): ) @pytest.mark.depends(on=data.dependencies) def test_read_file(self, workspace): - file = self.open_file(workspace, data.ground.files[0]) + files_contents = self.open_files(workspace, data.ground.files) - score = self.scoring(file, data.ground) + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, data.ground) + print("Your score is:", score) + scores.append(score) - print("You score is:", score) - - assert score + assert 1 in scores diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json index 1d2621081..037c5bd88 100644 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -6,7 +6,7 @@ "answer": "Washington", "should_contain": ["Washington"], "should_not_contain": ["New York", "Los Angeles", "San Francisco"], - "files": ["file_to_check.txt"] + "files": [".txt"] }, "mock_func": "basic_write_file_mock", "info": { diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 330128898..187378ff1 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -23,10 +23,12 @@ class TestWriteFile(BasicChallenge): ) @pytest.mark.depends(name="test_write_file") def test_write_file(self, workspace): - file = self.open_file(workspace, data.ground.files[0]) + files_contents = self.open_files(workspace, data.ground.files) - score = self.scoring(file, data.ground) + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, data.ground) + print("Your score is:", score) + scores.append(score) - print("You score is:", score) - - assert score + assert 1 in scores diff --git a/agbenchmark/tests/regression/regression_tests.json 
b/agbenchmark/tests/regression/regression_tests.json index 9e26dfeeb..c84fc9c99 100644 --- a/agbenchmark/tests/regression/regression_tests.json +++ b/agbenchmark/tests/regression/regression_tests.json @@ -1 +1,14 @@ -{} \ No newline at end of file +{ + "TestWriteFile": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[regression_data0-server_response0]" + }, + "TestReadFile": { + "difficulty": "basic", + "dependencies": [ + "test_write_file" + ], + "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[regression_data0-server_response0]" + } +} \ No newline at end of file -- cgit v1.2.3 From 4be22ae5abc884404370196bf71da86affe82131 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Mon, 26 Jun 2023 09:27:20 -0400 Subject: mini agi attempt --- agbenchmark/conftest.py | 46 +++++++++++++--------- agbenchmark/tests/regression/regression_tests.json | 15 +------ agent/agbenchmark_run.py | 27 +++++++++++++ agent/mini-agi | 1 + 4 files changed, 56 insertions(+), 33 deletions(-) create mode 100644 agent/agbenchmark_run.py create mode 160000 agent/mini-agi diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 78114c204..b3b69f194 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -7,6 +7,7 @@ import requests from requests.exceptions import RequestException from agbenchmark.mocks.MockManager import MockManager from agbenchmark.challenges.define_task_types import ChallengeData +import subprocess @pytest.fixture(scope="module") @@ -42,27 +43,34 @@ def server_response(request, config): else: task = request.param mock_function_name = None - # print(f"Server starting at {request.module}") - # try: - # response = requests.post( - # f"{config['hostname']}:{config['port']}", data={"task": task} - # ) - # response.raise_for_status() # This will raise an HTTPError if the status is 4xx or 5xx - # except RequestException: - # # If an exception occurs (could be connection, timeout, or HTTP errors), we use the mock - - if mock_function_name: - mock_manager = MockManager( - task - ) # workspace doesn't need to be passed in, stays the same - print("Server unavailable, using mock", mock_function_name) - mock_manager.delegate(mock_function_name) - else: - print("No mock provided") + # get the current file's directory + current_dir = os.path.dirname(os.path.abspath(__file__)) + + # construct the script's path + script_path = os.path.join(current_dir, "..", "agent", "agbenchmark_run.py") + + # form the command + command = ["python", script_path, task] + + # if mock_function_name: + # mock_manager = MockManager( + # task + # ) # workspace doesn't need to be passed in, stays the same + # print("Server unavailable, using mock", mock_function_name) + # mock_manager.delegate(mock_function_name) # else: - # # This code is run if no exception occurred - # print(f"Request succeeded with status code {response.status_code}") + # print("No mock provided") + + try: + # run the command and wait for it to complete + result = subprocess.run( + command, shell=True, check=True, text=True, capture_output=True + ) + return result + except subprocess.CalledProcessError as e: + print(f"Subprocess failed with the following error:\n{e}") + # If the subprocess returns a non-zero exit status regression_json = "agbenchmark/tests/regression/regression_tests.json" diff --git a/agbenchmark/tests/regression/regression_tests.json 
b/agbenchmark/tests/regression/regression_tests.json index c84fc9c99..9e26dfeeb 100644 --- a/agbenchmark/tests/regression/regression_tests.json +++ b/agbenchmark/tests/regression/regression_tests.json @@ -1,14 +1 @@ -{ - "TestWriteFile": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[regression_data0-server_response0]" - }, - "TestReadFile": { - "difficulty": "basic", - "dependencies": [ - "test_write_file" - ], - "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[regression_data0-server_response0]" - } -} \ No newline at end of file +{} \ No newline at end of file diff --git a/agent/agbenchmark_run.py b/agent/agbenchmark_run.py new file mode 100644 index 000000000..f509f5e66 --- /dev/null +++ b/agent/agbenchmark_run.py @@ -0,0 +1,27 @@ +import argparse +import subprocess +import os + + +def main(objective): + # get the current directory + current_dir = os.path.dirname(os.path.abspath(__file__)) + + # form the command + command = ( + f"python {os.path.join(current_dir, 'mini-agi', 'miniagi.py')} {objective}" + ) + + # run the command + subprocess.run(command, shell=True) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run miniagi.py with an objective.") + parser.add_argument( + "objective", type=str, help="The objective to pass to miniagi.py" + ) + + args = parser.parse_args() + + main(args.objective) diff --git a/agent/mini-agi b/agent/mini-agi new file mode 160000 index 000000000..d2add8f18 --- /dev/null +++ b/agent/mini-agi @@ -0,0 +1 @@ +Subproject commit d2add8f18caf96934a2d193583720cfc9b89451b -- cgit v1.2.3 From 84f170c9e0b310219566dbe9538ca1755019f424 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Mon, 26 Jun 2023 09:36:13 -0400 Subject: fixing relative imports --- agbenchmark/start_benchmark.py | 15 +++++++++++---- agent/mini-agi | 1 + 2 files changed, 12 insertions(+), 4 deletions(-) create mode 160000 agent/mini-agi diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index b7a116ebc..3a6a2b860 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -17,8 +17,10 @@ def start(category, noreg): """Start the benchmark tests. 
If a category flag is provided, run the categories with that mark.""" config_file = "agbenchmark/config.json" + config_dir = os.path.abspath(config_file) + # Check if configuration file exists and is not empty - if not os.path.exists(config_file) or os.stat(config_file).st_size == 0: + if not os.path.exists(config_dir) or os.stat(config_dir).st_size == 0: config = {} config["hostname"] = click.prompt( @@ -26,16 +28,21 @@ def start(category, noreg): ) config["port"] = click.prompt("Please enter a new port", default=8080) config["workspace"] = click.prompt( - "Please enter a new workspace path", default="/path/to/workspace" + "Please enter a new workspace path", default="agbenchmark/mocks/workspace" ) - with open(config_file, "w") as f: + with open(config_dir, "w") as f: json.dump(config, f) else: # If the configuration file exists and is not empty, load it - with open(config_file, "r") as f: + with open(config_dir, "r") as f: config = json.load(f) + # create workspace directory if it doesn't exist + workspace_path = config_dir = os.path.abspath(config["workspace"]) + if not os.path.exists(workspace_path): + os.makedirs(workspace_path, exist_ok=True) + print("Current configuration:") for key, value in config.items(): print(f"{key}: {value}") diff --git a/agent/mini-agi b/agent/mini-agi new file mode 160000 index 000000000..d2add8f18 --- /dev/null +++ b/agent/mini-agi @@ -0,0 +1 @@ +Subproject commit d2add8f18caf96934a2d193583720cfc9b89451b -- cgit v1.2.3 From a7972ad8737a8c5cebd3768f02013056c7594c93 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Tue, 27 Jun 2023 13:25:47 -0400 Subject: regression test creation --- agbenchmark/start_benchmark.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index 3a6a2b860..6adcc09bf 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -39,10 +39,17 @@ def start(category, noreg): config = json.load(f) # create workspace directory if it doesn't exist - workspace_path = config_dir = os.path.abspath(config["workspace"]) + workspace_path = os.path.abspath(config["workspace"]) if not os.path.exists(workspace_path): os.makedirs(workspace_path, exist_ok=True) + regression_path = os.path.abspath( + "agbenchmark/tests/regression/regression_tests.txt" + ) + if not os.path.exists(regression_path): + with open(regression_path, "a"): + pass + print("Current configuration:") for key, value in config.items(): print(f"{key}: {value}") -- cgit v1.2.3 From 8c44b9eddf7c566d5e39f7e11149772b96e23a5f Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 24 Jun 2023 09:42:36 -0400 Subject: basic challenges, more ChallengeData structure --- agbenchmark/Challenge.py | 22 ++++++++++++++++ agbenchmark/challenges/define_task_types.py | 16 ++++++++---- agbenchmark/challenges/retrieval/Retrieval.py | 22 +--------------- agbenchmark/challenges/retrieval/r1/r1_data.json | 10 +++++--- agbenchmark/challenges/retrieval/r1/r1_test.py | 6 +++-- agbenchmark/mocks/tests/basic_mocks.py | 28 +++++++++++++++++++++ agbenchmark/mocks/tests/retrieval_mocks.py | 7 +----- .../basic_abilities/read_file/r_file_data.json | 15 +++++++++++ .../basic_abilities/read_file/read_file_test.py | 29 ++++++++++++++++++++++ .../tests/basic_abilities/read_file_test.py | 0 .../basic_abilities/write_file/w_file_data.json | 16 ++++++++++++ .../basic_abilities/write_file/write_file_test.py | 27 ++++++++++++++++++++ .../tests/basic_abilities/write_file_test.py | 0 pyproject.toml | 3 ++- 14 files 
changed, 163 insertions(+), 38 deletions(-) create mode 100644 agbenchmark/tests/basic_abilities/read_file/r_file_data.json create mode 100644 agbenchmark/tests/basic_abilities/read_file/read_file_test.py delete mode 100644 agbenchmark/tests/basic_abilities/read_file_test.py create mode 100644 agbenchmark/tests/basic_abilities/write_file/w_file_data.json create mode 100644 agbenchmark/tests/basic_abilities/write_file/write_file_test.py delete mode 100644 agbenchmark/tests/basic_abilities/write_file_test.py diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py index 20bf55853..9828a0e9e 100644 --- a/agbenchmark/Challenge.py +++ b/agbenchmark/Challenge.py @@ -1,5 +1,6 @@ import os from typing import Optional +from agbenchmark.challenges.define_task_types import Ground class Challenge: @@ -30,3 +31,24 @@ class Challenge: for filename in os.listdir(workspace) if os.path.isfile(os.path.join(workspace, filename)) ] + + def scoring(self, content: str, ground: Ground): + if ground.should_contain: + for should_contain_word in ground.should_contain: + if should_contain_word not in content: + return 0.0 + else: + print( + f"Word that should exist: {should_contain_word} exists in the content" + ) + + if ground.should_not_contain: + for should_not_contain_word in ground.should_not_contain: + if should_not_contain_word in content: + return 0.0 + else: + print( + f"Word that should not exist: {should_not_contain_word} does not exist in the content" + ) + + return 1.0 diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index f1a841b53..879a46af0 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -4,6 +4,12 @@ import json import os +class Info(BaseModel): + difficulty: str + description: str + side_effects: List[str] + + class Ground(BaseModel): answer: str should_contain: Optional[List[str]] @@ -11,20 +17,20 @@ class Ground(BaseModel): files: List[str] -class Challenge(BaseModel): - category: str +class ChallengeData(BaseModel): + category: List[str] task: str ground: Ground - difficulty: str mock_func: Optional[str] = None + info: Info def serialize(self, path: str) -> None: with open(path, "w") as file: file.write(self.json()) @staticmethod - def deserialize(path: str) -> "Challenge": + def deserialize(path: str) -> "ChallengeData": print("Deserializing", path) with open(path, "r") as file: data = json.load(file) - return Challenge(**data) + return ChallengeData(**data) diff --git a/agbenchmark/challenges/retrieval/Retrieval.py b/agbenchmark/challenges/retrieval/Retrieval.py index 2db22ae4d..9434d69c3 100644 --- a/agbenchmark/challenges/retrieval/Retrieval.py +++ b/agbenchmark/challenges/retrieval/Retrieval.py @@ -1,27 +1,7 @@ from agbenchmark.Challenge import Challenge -from agbenchmark.challenges.define_task_types import Ground class RetrievalChallenge(Challenge): """Challenge for information-retrieval""" - def scoring(self, content: str, ground: Ground): - if ground.should_contain: - for should_contain_word in ground.should_contain: - if should_contain_word not in content: - return 0.0 - else: - print( - f"Word that should exist: {should_contain_word} exists in the content" - ) - - if ground.should_not_contain: - for should_not_contain_word in ground.should_not_contain: - if should_not_contain_word in content: - return 0.0 - else: - print( - f"Word that should not exist: {should_not_contain_word} does not exist in the content" - ) - - return 1.0 + pass diff --git 
a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index c7cc31004..08b74d1b7 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -1,5 +1,5 @@ { - "category": "retrieval", + "category": ["basic"], "task": "What is the capital of America?", "ground": { "answer": "Washington", @@ -7,6 +7,10 @@ "should_not_contain": ["New York", "Los Angeles", "San Francisco"], "files": ["file_to_check.txt"] }, - "difficulty": "easy", - "mock_func": "retrieval_1_mock" + "mock_func": "write_file_mock", + "info": { + "difficulty": "easy", + "description": "Tests the writing to file", + "side_effects": ["tests if there is in fact an LLM attached"] + } } diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index e20c9f7b9..d37c5e795 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -1,9 +1,11 @@ import pytest from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge -from agbenchmark.challenges.define_task_types import Challenge, Ground +from agbenchmark.challenges.define_task_types import ChallengeData, Ground import os -data = Challenge.deserialize(os.path.join(os.path.dirname(__file__), "r1_data.json")) +data = ChallengeData.deserialize( + os.path.join(os.path.dirname(__file__), "r1_data.json") +) class TestRetrieval1(RetrievalChallenge): diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index e69de29bb..eb7b96541 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -0,0 +1,28 @@ +from agbenchmark.Challenge import Challenge +from ..basic_gpt_agent import basic_gpt_agent + + +def basic_read_file_mock(task: str, workspace: str): + """ + This mock reads a file and returns its content. + """ + + Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing") + + file_contents = Challenge.open_file(workspace, "file_to_check.txt") + + Challenge.write_to_file( + workspace, "file_to_check.txt", f"random string: {file_contents}" + ) + + +def basic_write_file_mock(task: str, workspace: str): + """ + This mock writes to a file (creates one if it doesn't exist) + """ + + # Call the basic_gpt_agent to get a response. + response = basic_gpt_agent(task) + + # Open the file in write mode. + Challenge.write_to_file(workspace, "file_to_check.txt", response) diff --git a/agbenchmark/mocks/tests/retrieval_mocks.py b/agbenchmark/mocks/tests/retrieval_mocks.py index 23f4bde17..2481de060 100644 --- a/agbenchmark/mocks/tests/retrieval_mocks.py +++ b/agbenchmark/mocks/tests/retrieval_mocks.py @@ -1,4 +1,3 @@ -from ..basic_gpt_agent import basic_gpt_agent from agbenchmark.Challenge import Challenge @@ -6,8 +5,4 @@ from agbenchmark.Challenge import Challenge # Prerequisites here would be writing to a file (basic_abilities test). # Should also check if prerequisites exists in regression file def retrieval_1_mock(task: str, workspace: str): - # Call the basic_gpt_agent to get a response. - response = basic_gpt_agent(task) - - # Open the file in write mode. 
- Challenge.write_to_file(workspace, "file_to_check.txt", response) + pass diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json new file mode 100644 index 000000000..55319ddfc --- /dev/null +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -0,0 +1,15 @@ +{ + "category": ["basic"], + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "ground": { + "answer": "random string: this is how we're doing", + "should_contain": ["random string: this is how we're doing"], + "files": ["file_to_check.txt"] + }, + "mock_func": "basic_read_file_mock", + "info": { + "description": "This reads the file quickly", + "difficulty": "basic", + "side_effects": [""] + } +} diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py new file mode 100644 index 000000000..610ccdab6 --- /dev/null +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -0,0 +1,29 @@ +import pytest +from agbenchmark.challenges.define_task_types import ChallengeData +from agbenchmark.Challenge import Challenge +import os + +data = ChallengeData.deserialize( + os.path.join(os.path.dirname(__file__), "r_file_data.json") +) + + +class TestReadFile(Challenge): + """Testing if LLM can read a file""" + + @pytest.mark.parametrize( + "server_response", + [(data.task, data.mock_func)], + indirect=True, + ) + @pytest.mark.basic + def test_retrieval( + self, workspace + ): # create_file simply there for the function to depend on the fixture + file = self.open_file(workspace, data.ground.files[0]) + + score = self.scoring(file, data.ground) + + print("You score is:", score) + + assert score diff --git a/agbenchmark/tests/basic_abilities/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json new file mode 100644 index 000000000..4aaa1347d --- /dev/null +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -0,0 +1,16 @@ +{ + "category": ["basic"], + "task": "What is the capital of America?", + "ground": { + "answer": "Washington", + "should_contain": ["Washington"], + "should_not_contain": ["New York", "Los Angeles", "San Francisco"], + "files": ["file_to_check.txt"] + }, + "mock_func": "basic_write_file_mock", + "info": { + "difficulty": "easy", + "description": "Tests the writing to file", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py new file mode 100644 index 000000000..ccb10fe70 --- /dev/null +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -0,0 +1,27 @@ +import pytest +from agbenchmark.challenges.define_task_types import ChallengeData +from agbenchmark.Challenge import Challenge +import os + +data = ChallengeData.deserialize( + os.path.join(os.path.dirname(__file__), "w_file_data.json") +) + + +class TestWriteFile(Challenge): + """Testing if LLM can write to a file""" + + @pytest.mark.parametrize( + "server_response", + [(data.task, data.mock_func)], + indirect=True, + ) + @pytest.mark.basic + def test_retrieval(self, workspace): + file = self.open_file(workspace, data.ground.files[0]) 
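# scoring() comes from the Challenge base class added in this commit: it
# returns 1.0 only if every ground.should_contain string is present and no
# ground.should_not_contain string appears in the content, else 0.0.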
+ + score = self.scoring(file, data.ground) + + print("You score is:", score) + + assert score diff --git a/agbenchmark/tests/basic_abilities/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/pyproject.toml b/pyproject.toml index 5498381a2..6f79e75ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,8 @@ testpaths = [ ] markers = [ "retrieval", - "regression" + "regression", + "basic" ] [tool.poetry.scripts] -- cgit v1.2.3 From 22458a04e81f6a4e200581fe4046182b96f6e17c Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 24 Jun 2023 12:15:53 -0400 Subject: file creation from within file before server :) --- agbenchmark/conftest.py | 2 +- agbenchmark/mocks/tests/basic_mocks.py | 2 +- agbenchmark/tests/basic_abilities/read_file/read_file_test.py | 8 ++++++++ agbenchmark/tests/regression/regression_tests.txt | 2 ++ 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 908d39e89..434f6dbde 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -17,7 +17,7 @@ def config(): return config -@pytest.fixture +@pytest.fixture(scope="module") def workspace(config): yield config["workspace"] # teardown after test function completes diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index eb7b96541..bbff6a9c7 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -7,7 +7,7 @@ def basic_read_file_mock(task: str, workspace: str): This mock reads a file and returns its content. """ - Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing") + # Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing") file_contents = Challenge.open_file(workspace, "file_to_check.txt") diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 610ccdab6..35d1d80c5 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -8,6 +8,14 @@ data = ChallengeData.deserialize( ) +@pytest.fixture(scope="module", autouse=True) +def setup_module(workspace): + if data.ground.should_contain: + Challenge.write_to_file( + workspace, data.ground.files[0], "this is how we're doing" + ) + + class TestReadFile(Challenge): """Testing if LLM can read a file""" diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index e69de29bb..a5f8fbd1d 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -0,0 +1,2 @@ +agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0] -- cgit v1.2.3 From 60a7ac2343df15127e38da5d490edab887f81608 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 24 Jun 2023 12:24:17 -0400 Subject: adding dependencies on other challenges --- agbenchmark/mocks/tests/basic_mocks.py | 2 -- .../tests/basic_abilities/read_file/read_file_test.py | 1 + .../tests/basic_abilities/write_file/write_file_test.py | 1 + agbenchmark/tests/regression/regression_tests.txt | 1 - poetry.lock | 15 ++++++++++++++- pyproject.toml | 1 + 6 files changed, 17 insertions(+), 4 
deletions(-) diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index bbff6a9c7..550095b72 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -7,8 +7,6 @@ def basic_read_file_mock(task: str, workspace: str): This mock reads a file and returns its content. """ - # Challenge.write_to_file(workspace, "file_to_check.txt", "this is how we're doing") - file_contents = Challenge.open_file(workspace, "file_to_check.txt") Challenge.write_to_file( diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 35d1d80c5..ea794281e 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -25,6 +25,7 @@ class TestReadFile(Challenge): indirect=True, ) @pytest.mark.basic + @pytest.mark.dependency(depends=["write_file"]) def test_retrieval( self, workspace ): # create_file simply there for the function to depend on the fixture diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index ccb10fe70..b2c559c9e 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -17,6 +17,7 @@ class TestWriteFile(Challenge): indirect=True, ) @pytest.mark.basic + @pytest.mark.dependency(name="write_file") def test_retrieval(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index a5f8fbd1d..84e625af4 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,2 +1 @@ -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0] agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0] diff --git a/poetry.lock b/poetry.lock index 3f1059aaf..3bc37622e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -595,6 +595,19 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +[[package]] +name = "pytest-dependency" +version = "0.5.1" +description = "Manage dependencies of tests" +optional = false +python-versions = "*" +files = [ + {file = "pytest-dependency-0.5.1.tar.gz", hash = "sha256:c2a892906192663f85030a6ab91304e508e546cddfe557d692d61ec57a1d946b"}, +] + +[package.dependencies] +pytest = ">=3.6.0" + [[package]] name = "requests" version = "2.31.0" @@ -765,4 +778,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "a13e69f2bd9e511e1af92ed02b155a90dec38a9b8d983a711e1b67931b467d38" +content-hash = "4a1629eb643b5b68d47f6d1407942aa6d4a796c6d5a1b6a54bbc096b9d0efa2d" diff --git a/pyproject.toml b/pyproject.toml index 6f79e75ce..087ac8447 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ click = "^8.1.3" requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" +pytest-dependency = "^0.5.1" [build-system] -- cgit v1.2.3 From 2f28a66591ea37715282271ccf92560e89a7924a Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 24 Jun 2023 14:42:35 -0400 Subject: more 
elegant marking & dependency solution --- README.md | 74 +++++++++++++++++++--- agbenchmark/challenges/README.md | 38 +++++------ agbenchmark/challenges/define_task_types.py | 1 + agbenchmark/challenges/retrieval/r1/r1_data.json | 1 + .../tests/basic_abilities/BasicChallenge.py | 7 ++ .../basic_abilities/read_file/r_file_data.json | 1 + .../basic_abilities/read_file/read_file_test.py | 12 ++-- .../basic_abilities/write_file/w_file_data.json | 1 + .../basic_abilities/write_file/write_file_test.py | 9 ++- agbenchmark/tests/regression/regression_tests.txt | 2 + poetry.lock | 17 ++++- pyproject.toml | 1 + 12 files changed, 126 insertions(+), 38 deletions(-) create mode 100644 agbenchmark/tests/basic_abilities/BasicChallenge.py diff --git a/README.md b/README.md index 0a8d119af..0ad0cf345 100644 --- a/README.md +++ b/README.md @@ -51,15 +51,73 @@ Share your progress :) to create a test: -``` -@pytest.mark.parametrize( -"server_response", -["VARIABLE"], # VARIABLE = the query/goal you provide to the model -indirect=True, +```python +import pytest +from agbenchmark.challenges.define_task_types import ChallengeData +from ..CategoryChallenge import CategoryChallenge +import os + +data = ChallengeData.deserialize( + os.path.join(os.path.dirname(__file__), "r_file_data.json") ) -@pytest.mark.(VARIABLE) # VARIABLE = category of the test -def test_file_in_workspace(workspace): # VARIABLE = the actual test that asserts -assert os.path.exists(os.path.join(workspace, "file_to_check.txt")) + +class TestSomething(CategoryChallenge): + """Testing if LLM can read a file""" + + @pytest.mark.parametrize( + "server_response", + [(data.task, data.mock_func)], + indirect=True, + ) + def test_retrieval( + self, workspace + ): + # scoring logic goes here +``` + +All challenges will inherit from parent class which has the mark + +```python +@pytest.mark.basic +class BasicChallenge(Challenge): + pass +``` + +If you want to add a custom mark to a Challenge, you must specify it before the test definition + +```python +@pytest.mark.other_mark +def test_retrieval(self, workspace): +``` + +To add a dependency to a challenge use the following + +```python +# to defining what a test depends on +from pytest_dependency import depends + +def test1(self, request, workspace): + depends(request, data.dependencies) +# for defining a test as a dependency +@pytest.mark.dependency() +def test2 +``` + +Ordering of challenges needs to be used in combination with the above to make sure it executes afterwards + +```python +@pytest.mark.run(order=1) +``` + +To create a file to test a challenge, add this to the challenge file which will create a file before running the server + +```python +@pytest.fixture(scope="module", autouse=True) +def setup_module(workspace): + if data.ground.should_contain: + Challenge.write_to_file( + workspace, data.ground.files[0], "this is how we're doing" + ) ``` ## Api diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md index 50efe2c4d..d5229e937 100644 --- a/agbenchmark/challenges/README.md +++ b/agbenchmark/challenges/README.md @@ -4,28 +4,25 @@ Input: -- **category** (str): information-retrieval -- **difficulty**(str): the difficulty of this query. choices from - -## Information-retrieval challenges - -Input: - -- **category** (str): information-retrieval -- **task** (str): the question the agent needs to be solve. +- **category** (str[]): Category of the challenge such as 'retrieval', 'comprehension', etc. _this is not currently used. 
for the future it may be needed_ +- **task** (str): The task that the agent needs to solve. +- **dependencies** (str[]): The dependencies that the challenge needs to run. - **ground** (dict): The ground truth. - - **answer** (str): The raw text of ground truth answer - - **should_contain** (list): the exact strings that is required in the final answer - - **should_not_contain** (list): the exact strings that should not be in the final answer - - **files**: files that the are used for retrieval. Can specify file here or an extension **TODO:** like .txt -- **difficulty**(str): the difficulty of this query. choices from -- **mock_func**: function to mock the agent's response. This is used for testing purposes + - **answer** (str): The raw text of the ground truth answer. + - **should_contain** (list): The exact strings that are required in the final answer. + - **should_not_contain** (list): The exact strings that should not be in the final answer. + - **files** (list): Files that are used for retrieval. Can specify file here or an extension. +- **mock_func** (str): Function to mock the agent's response. This is used for testing purposes. +- **info** (dict): Additional info about the challenge. + - **difficulty** (str): The difficulty of this query. + - **description** (str): Description of the challenge. + - **side_effects** (str[]): Describes the effects of the challenge. Example: ```python { - "category": "retrieval", + "category": ["basic"], "task": "What is the capital of America?", "ground": { "answer": "Washington", @@ -33,11 +30,16 @@ Example: "should_not_contain": ["New York", "Los Angeles", "San Francisco"], "files": ["file_to_check.txt"] }, - "difficulty": "easy" + "mock_func": "write_file_mock", + "info": { + "difficulty": "easy", + "description": "Tests the writing to file", + "side_effects": ["tests if there is in fact an LLM attached"] + } } ``` -Output: +Current Output: - **score** (float): scores range from [0, 1] diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index 879a46af0..694671218 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -20,6 +20,7 @@ class Ground(BaseModel): class ChallengeData(BaseModel): category: List[str] task: str + dependencies: List[str] ground: Ground mock_func: Optional[str] = None info: Info diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index 08b74d1b7..fe05b6d51 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -1,5 +1,6 @@ { "category": ["basic"], + "dependencies": ["test_write_file"], "task": "What is the capital of America?", "ground": { "answer": "Washington", diff --git a/agbenchmark/tests/basic_abilities/BasicChallenge.py b/agbenchmark/tests/basic_abilities/BasicChallenge.py new file mode 100644 index 000000000..563207405 --- /dev/null +++ b/agbenchmark/tests/basic_abilities/BasicChallenge.py @@ -0,0 +1,7 @@ +import pytest +from agbenchmark.Challenge import Challenge + + +@pytest.mark.basic +class BasicChallenge(Challenge): + pass diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index 55319ddfc..8c5ef62db 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -1,6 +1,7 @@ { "category": ["basic"], "task": "Write the 
string 'random string' before any existing text to the file called file_to_check.txt", + "dependencies": ["test_write_file"], "ground": { "answer": "random string: this is how we're doing", "should_contain": ["random string: this is how we're doing"], diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index ea794281e..03b2d6cab 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -1,7 +1,9 @@ import pytest from agbenchmark.challenges.define_task_types import ChallengeData from agbenchmark.Challenge import Challenge +from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os +from pytest_dependency import depends data = ChallengeData.deserialize( os.path.join(os.path.dirname(__file__), "r_file_data.json") @@ -16,7 +18,7 @@ def setup_module(workspace): ) -class TestReadFile(Challenge): +class TestReadFile(BasicChallenge): """Testing if LLM can read a file""" @pytest.mark.parametrize( @@ -24,11 +26,9 @@ class TestReadFile(Challenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.basic - @pytest.mark.dependency(depends=["write_file"]) - def test_retrieval( - self, workspace - ): # create_file simply there for the function to depend on the fixture + def test_read_file(self, request, workspace): + depends(request, data.dependencies) + file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json index 4aaa1347d..562d1c364 100644 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -1,6 +1,7 @@ { "category": ["basic"], "task": "What is the capital of America?", + "dependencies": [], "ground": { "answer": "Washington", "should_contain": ["Washington"], diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index b2c559c9e..b09162e3d 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -1,6 +1,6 @@ import pytest from agbenchmark.challenges.define_task_types import ChallengeData -from agbenchmark.Challenge import Challenge +from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os data = ChallengeData.deserialize( @@ -8,7 +8,7 @@ data = ChallengeData.deserialize( ) -class TestWriteFile(Challenge): +class TestWriteFile(BasicChallenge): """Testing if LLM can write to a file""" @pytest.mark.parametrize( @@ -16,9 +16,8 @@ class TestWriteFile(Challenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.basic - @pytest.mark.dependency(name="write_file") - def test_retrieval(self, workspace): + @pytest.mark.dependency() + def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index 84e625af4..b831003fc 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1 +1,3 @@ 
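Each line recorded in this file is a full pytest node ID; at collection time conftest.py tags matching items so the stable set can be rerun with `pytest -m regression`. A rough sketch of that hook, assuming the regression_manager shown earlier:

    import pytest

    def pytest_collection_modifyitems(items):
        for item in items:
            # regression_manager.tests holds the raw lines of this file
            if item.nodeid + "\n" in regression_manager.tests:
                item.add_marker(pytest.mark.regression)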
agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] diff --git a/poetry.lock b/poetry.lock index 3bc37622e..f6f24c5f2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -608,6 +608,21 @@ files = [ [package.dependencies] pytest = ">=3.6.0" +[[package]] +name = "pytest-ordering" +version = "0.6" +description = "pytest plugin to run your tests in a specific order" +optional = false +python-versions = "*" +files = [ + {file = "pytest-ordering-0.6.tar.gz", hash = "sha256:561ad653626bb171da78e682f6d39ac33bb13b3e272d406cd555adb6b006bda6"}, + {file = "pytest_ordering-0.6-py2-none-any.whl", hash = "sha256:27fba3fc265f5d0f8597e7557885662c1bdc1969497cd58aff6ed21c3b617de2"}, + {file = "pytest_ordering-0.6-py3-none-any.whl", hash = "sha256:3f314a178dbeb6777509548727dc69edf22d6d9a2867bf2d310ab85c403380b6"}, +] + +[package.dependencies] +pytest = "*" + [[package]] name = "requests" version = "2.31.0" @@ -778,4 +793,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "4a1629eb643b5b68d47f6d1407942aa6d4a796c6d5a1b6a54bbc096b9d0efa2d" +content-hash = "65b68e43440faafbd2883edd6b10bc177ab334380e908c27c9f511703065f8e7" diff --git a/pyproject.toml b/pyproject.toml index 087ac8447..faee61c2d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" pytest-dependency = "^0.5.1" +pytest-ordering = "^0.6" [build-system] -- cgit v1.2.3 From 06a6f080543ddffd8baf3aaf51ec97ff1fce86b3 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 25 Jun 2023 00:22:53 -0400 Subject: finally figured out right way to do dependencies --- agbenchmark/challenges/retrieval/Retrieval.py | 2 ++ agbenchmark/challenges/retrieval/r1/r1_data.json | 4 ++-- agbenchmark/challenges/retrieval/r1/r1_test.py | 6 ++++-- agbenchmark/tests/basic_abilities/BasicChallenge.py | 1 + .../tests/basic_abilities/read_file/r_file_data.json | 4 +++- .../tests/basic_abilities/read_file/read_file_test.py | 6 ++---- .../basic_abilities/write_file/write_file_test.py | 1 - agbenchmark/tests/regression/regression_tests.txt | 4 ++-- poetry.lock | 19 ++++++++++++++++++- pyproject.toml | 3 ++- 10 files changed, 36 insertions(+), 14 deletions(-) diff --git a/agbenchmark/challenges/retrieval/Retrieval.py b/agbenchmark/challenges/retrieval/Retrieval.py index 9434d69c3..b8aa81ce3 100644 --- a/agbenchmark/challenges/retrieval/Retrieval.py +++ b/agbenchmark/challenges/retrieval/Retrieval.py @@ -1,6 +1,8 @@ from agbenchmark.Challenge import Challenge +import pytest +@pytest.mark.retrieval class RetrievalChallenge(Challenge): """Challenge for information-retrieval""" diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index fe05b6d51..562d1c364 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -1,14 +1,14 @@ { "category": ["basic"], - "dependencies": ["test_write_file"], "task": "What is the capital of America?", + "dependencies": [], "ground": { "answer": "Washington", "should_contain": ["Washington"], "should_not_contain": ["New York", "Los Angeles", "San Francisco"], "files": ["file_to_check.txt"] }, - "mock_func": "write_file_mock", + "mock_func": 
"basic_write_file_mock", "info": { "difficulty": "easy", "description": "Tests the writing to file", diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index d37c5e795..5e6d6abf4 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -2,6 +2,8 @@ import pytest from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge from agbenchmark.challenges.define_task_types import ChallengeData, Ground import os +from pytest_dependency import depends + data = ChallengeData.deserialize( os.path.join(os.path.dirname(__file__), "r1_data.json") @@ -16,8 +18,8 @@ class TestRetrieval1(RetrievalChallenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.retrieval - def test_retrieval(self, workspace): + def test_retrieval(self, request, workspace): + depends(request, data.dependencies) file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/basic_abilities/BasicChallenge.py b/agbenchmark/tests/basic_abilities/BasicChallenge.py index 563207405..0cada86cc 100644 --- a/agbenchmark/tests/basic_abilities/BasicChallenge.py +++ b/agbenchmark/tests/basic_abilities/BasicChallenge.py @@ -2,6 +2,7 @@ import pytest from agbenchmark.Challenge import Challenge +@pytest.mark.run(order=1) @pytest.mark.basic class BasicChallenge(Challenge): pass diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index 8c5ef62db..4d04f33e7 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -1,7 +1,9 @@ { "category": ["basic"], "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", - "dependencies": ["test_write_file"], + "dependencies": [ + "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file" + ], "ground": { "answer": "random string: this is how we're doing", "should_contain": ["random string: this is how we're doing"], diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 03b2d6cab..ad08da4e0 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -3,7 +3,6 @@ from agbenchmark.challenges.define_task_types import ChallengeData from agbenchmark.Challenge import Challenge from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os -from pytest_dependency import depends data = ChallengeData.deserialize( os.path.join(os.path.dirname(__file__), "r_file_data.json") @@ -26,9 +25,8 @@ class TestReadFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) - def test_read_file(self, request, workspace): - depends(request, data.dependencies) - + @pytest.mark.order(after=data.dependencies) + def test_read_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index b09162e3d..4c94320e0 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -16,7 +16,6 @@ 
class TestWriteFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.dependency() def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index b831003fc..df27f3124 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,3 +1,3 @@ -agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_retrieval[server_response0] -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_retrieval[server_response0] agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] +agbenchmark/challenges/retrieval/r1/r1_test.py::TestRetrieval1::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[server_response0] diff --git a/poetry.lock b/poetry.lock index f6f24c5f2..4764bf493 100644 --- a/poetry.lock +++ b/poetry.lock @@ -608,6 +608,23 @@ files = [ [package.dependencies] pytest = ">=3.6.0" +[[package]] +name = "pytest-order" +version = "1.1.0" +description = "pytest plugin to run your tests in a specific order" +optional = false +python-versions = ">=3.6" +files = [ + {file = "pytest-order-1.1.0.tar.gz", hash = "sha256:139d25b30826b78eebb42722f747eab14c44b88059d7a71d4f79d14a057269a5"}, + {file = "pytest_order-1.1.0-py3-none-any.whl", hash = "sha256:3b3730969c97900fa5cd31ecff80847680ed56b2490954565c14949ba60d9371"}, +] + +[package.dependencies] +pytest = [ + {version = ">=5.0", markers = "python_version < \"3.10\""}, + {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, +] + [[package]] name = "pytest-ordering" version = "0.6" @@ -793,4 +810,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "65b68e43440faafbd2883edd6b10bc177ab334380e908c27c9f511703065f8e7" +content-hash = "64d22c864fe244497b7ebc81ead1be0b0570b14ee1ced323813d427672e17ff3" diff --git a/pyproject.toml b/pyproject.toml index faee61c2d..fd2c52041 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ openai = "^0.27.8" pydantic = "^1.10.9" pytest-dependency = "^0.5.1" pytest-ordering = "^0.6" +pytest-order = "^1.1.0" [build-system] @@ -24,7 +25,7 @@ build-backend = "poetry.core.masonry.api" [tool.pytest.ini_options] minversion = "6.0" -addopts = "-ra -q" +addopts = "--order-dependencies" # -ra -q testpaths = [ "tests", "agbenchmark", ] -- cgit v1.2.3 From a2f79760ce8abdddfc27c5b0b42a58df903b352c Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 25 Jun 2023 08:48:16 -0400 Subject: other was non solution, solution is pytest-depends --- agbenchmark/challenges/README.md | 20 +++--- agbenchmark/challenges/retrieval/r1/r1_test.py | 2 - .../tests/basic_abilities/BasicChallenge.py | 1 - .../basic_abilities/read_file/r_file_data.json | 4 +- .../basic_abilities/read_file/read_file_test.py | 2 +- .../basic_abilities/write_file/write_file_test.py | 1 + agbenchmark/tests/regression/regression_tests.txt | 2 +- poetry.lock | 80 ++++++++++++---------- pyproject.toml | 6 +- 9 files changed, 59 insertions(+), 59 deletions(-) diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md index d5229e937..e457b85c4 100644 --- a/agbenchmark/challenges/README.md +++ b/agbenchmark/challenges/README.md @@ -6,7 +6,7 @@ Input: - **category** (str[]): 
Category of the challenge such as 'retrieval', 'comprehension', etc. _this is not currently used. for the future it may be needed_ - **task** (str): The task that the agent needs to solve. -- **dependencies** (str[]): The dependencies that the challenge needs to run. +- **dependencies** (str[]): The dependencies that the challenge needs to run. Needs to be the full node to the test function. - **ground** (dict): The ground truth. - **answer** (str): The raw text of the ground truth answer. - **should_contain** (list): The exact strings that are required in the final answer. @@ -23,18 +23,20 @@ Example: ```python { "category": ["basic"], - "task": "What is the capital of America?", + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "dependencies": [ + "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file" + ], "ground": { - "answer": "Washington", - "should_contain": ["Washington"], - "should_not_contain": ["New York", "Los Angeles", "San Francisco"], + "answer": "random string: this is how we're doing", + "should_contain": ["random string: this is how we're doing"], "files": ["file_to_check.txt"] }, - "mock_func": "write_file_mock", + "mock_func": "basic_read_file_mock", "info": { - "difficulty": "easy", - "description": "Tests the writing to file", - "side_effects": ["tests if there is in fact an LLM attached"] + "description": "This reads the file quickly", + "difficulty": "basic", + "side_effects": [""] } } diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 5e6d6abf4..45becaf75 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -2,7 +2,6 @@ import pytest from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge from agbenchmark.challenges.define_task_types import ChallengeData, Ground import os -from pytest_dependency import depends data = ChallengeData.deserialize( @@ -19,7 +18,6 @@ class TestRetrieval1(RetrievalChallenge): indirect=True, ) def test_retrieval(self, request, workspace): - depends(request, data.dependencies) file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/tests/basic_abilities/BasicChallenge.py b/agbenchmark/tests/basic_abilities/BasicChallenge.py index 0cada86cc..563207405 100644 --- a/agbenchmark/tests/basic_abilities/BasicChallenge.py +++ b/agbenchmark/tests/basic_abilities/BasicChallenge.py @@ -2,7 +2,6 @@ import pytest from agbenchmark.Challenge import Challenge -@pytest.mark.run(order=1) @pytest.mark.basic class BasicChallenge(Challenge): pass diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index 4d04f33e7..8c5ef62db 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -1,9 +1,7 @@ { "category": ["basic"], "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", - "dependencies": [ - "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file" - ], + "dependencies": ["test_write_file"], "ground": { "answer": "random string: this is how we're doing", "should_contain": ["random string: this is how we're doing"], diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py 
b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index ad08da4e0..494a9b071 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -25,7 +25,7 @@ class TestReadFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) - @pytest.mark.order(after=data.dependencies) + @pytest.mark.depends(on=data.dependencies) def test_read_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 4c94320e0..0a4ef4a2c 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -16,6 +16,7 @@ class TestWriteFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) + @pytest.mark.depends(name="test_write_file") def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index df27f3124..57b94cd7a 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,3 +1,3 @@ -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] agbenchmark/challenges/retrieval/r1/r1_test.py::TestRetrieval1::test_retrieval[server_response0] +agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[server_response0] diff --git a/poetry.lock b/poetry.lock index 4764bf493..d7939fbfe 100644 --- a/poetry.lock +++ b/poetry.lock @@ -368,6 +368,20 @@ files = [ {file = "frozenlist-1.3.3.tar.gz", hash = "sha256:58bcc55721e8a90b88332d6cd441261ebb22342e238296bb330968952fbb3a6a"}, ] +[[package]] +name = "future-fstrings" +version = "1.2.0" +description = "A backport of fstrings to python<3.6" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "future_fstrings-1.2.0-py2.py3-none-any.whl", hash = "sha256:90e49598b553d8746c4dc7d9442e0359d038c3039d802c91c0a55505da318c63"}, + {file = "future_fstrings-1.2.0.tar.gz", hash = "sha256:6cf41cbe97c398ab5a81168ce0dbb8ad95862d3caf23c21e4430627b90844089"}, +] + +[package.extras] +rewrite = ["tokenize-rt (>=3)"] + [[package]] name = "idna" version = "3.4" @@ -473,6 +487,24 @@ files = [ {file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"}, ] +[[package]] +name = "networkx" +version = "3.1" +description = "Python package for creating and manipulating graphs and networks" +optional = false +python-versions = ">=3.8" +files = [ + {file = "networkx-3.1-py3-none-any.whl", hash = "sha256:4f33f68cb2afcf86f28a45f43efc27a9386b535d567d2127f8f61d51dec58d36"}, + {file = "networkx-3.1.tar.gz", hash = "sha256:de346335408f84de0eada6ff9fafafff9bcda11f0a0dfaa931133debb146ab61"}, +] + +[package.extras] +default = ["matplotlib (>=3.4)", "numpy (>=1.20)", "pandas (>=1.3)", "scipy (>=1.8)"] +developer = ["mypy (>=1.1)", "pre-commit (>=3.2)"] +doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx-theme (>=0.13)", "sphinx (>=6.1)", "sphinx-gallery (>=0.12)", "texext (>=0.6.7)"] +extra = ["lxml (>=4.6)", 
"pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"] +test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] + [[package]] name = "openai" version = "0.27.8" @@ -596,49 +628,21 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] [[package]] -name = "pytest-dependency" -version = "0.5.1" -description = "Manage dependencies of tests" -optional = false -python-versions = "*" -files = [ - {file = "pytest-dependency-0.5.1.tar.gz", hash = "sha256:c2a892906192663f85030a6ab91304e508e546cddfe557d692d61ec57a1d946b"}, -] - -[package.dependencies] -pytest = ">=3.6.0" - -[[package]] -name = "pytest-order" -version = "1.1.0" -description = "pytest plugin to run your tests in a specific order" -optional = false -python-versions = ">=3.6" -files = [ - {file = "pytest-order-1.1.0.tar.gz", hash = "sha256:139d25b30826b78eebb42722f747eab14c44b88059d7a71d4f79d14a057269a5"}, - {file = "pytest_order-1.1.0-py3-none-any.whl", hash = "sha256:3b3730969c97900fa5cd31ecff80847680ed56b2490954565c14949ba60d9371"}, -] - -[package.dependencies] -pytest = [ - {version = ">=5.0", markers = "python_version < \"3.10\""}, - {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, -] - -[[package]] -name = "pytest-ordering" -version = "0.6" -description = "pytest plugin to run your tests in a specific order" +name = "pytest-depends" +version = "1.0.1" +description = "Tests that depend on other tests" optional = false python-versions = "*" files = [ - {file = "pytest-ordering-0.6.tar.gz", hash = "sha256:561ad653626bb171da78e682f6d39ac33bb13b3e272d406cd555adb6b006bda6"}, - {file = "pytest_ordering-0.6-py2-none-any.whl", hash = "sha256:27fba3fc265f5d0f8597e7557885662c1bdc1969497cd58aff6ed21c3b617de2"}, - {file = "pytest_ordering-0.6-py3-none-any.whl", hash = "sha256:3f314a178dbeb6777509548727dc69edf22d6d9a2867bf2d310ab85c403380b6"}, + {file = "pytest-depends-1.0.1.tar.gz", hash = "sha256:90a28e2b87b75b18abd128c94015248544acac20e4392e9921e5a86f93319dfe"}, + {file = "pytest_depends-1.0.1-py3-none-any.whl", hash = "sha256:a1df072bcc93d77aca3f0946903f5fed8af2d9b0056db1dfc9ed5ac164ab0642"}, ] [package.dependencies] -pytest = "*" +colorama = "*" +future-fstrings = "*" +networkx = "*" +pytest = ">=3" [[package]] name = "requests" @@ -810,4 +814,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "64d22c864fe244497b7ebc81ead1be0b0570b14ee1ced323813d427672e17ff3" +content-hash = "a03dfa9938e062bdf564b7678df9dc9277c7c8e504f14f98084c5a2d497a8f7c" diff --git a/pyproject.toml b/pyproject.toml index fd2c52041..0a4f8ba73 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,9 +14,7 @@ click = "^8.1.3" requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" -pytest-dependency = "^0.5.1" -pytest-ordering = "^0.6" -pytest-order = "^1.1.0" +pytest-depends = "^1.0.1" [build-system] @@ -25,7 +23,7 @@ build-backend = "poetry.core.masonry.api" [tool.pytest.ini_options] minversion = "6.0" -addopts = "--order-dependencies" # -ra -q +addopts = "-ra -q" testpaths = [ "tests", "agbenchmark", ] -- cgit v1.2.3 From 2411c35d0eb0af6ff0fb4a64ac2b431ea2d41adb Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 25 Jun 2023 11:12:33 -0400 Subject: update regression tests info --- agbenchmark/challenges/retrieval/r1/r1_test.py | 7 ++++- agbenchmark/conftest.py | 36 +++++++++++++++------- 
.../basic_abilities/read_file/read_file_test.py | 5 +++ .../basic_abilities/write_file/w_file_data.json | 2 +- .../basic_abilities/write_file/write_file_test.py | 5 +++ agbenchmark/tests/regression/RegressionManager.py | 25 +++++++++------ agbenchmark/tests/regression/regression_tests.json | 1 + agbenchmark/tests/regression/regression_tests.txt | 17 ++++++++-- 8 files changed, 73 insertions(+), 25 deletions(-) create mode 100644 agbenchmark/tests/regression/regression_tests.json diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 45becaf75..489d298fb 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -17,7 +17,12 @@ class TestRetrieval1(RetrievalChallenge): [(data.task, data.mock_func)], indirect=True, ) - def test_retrieval(self, request, workspace): + @pytest.mark.parametrize( + "regression_data", + [data], + indirect=True, + ) + def test_retrieval(self, workspace, current_challenge_data): file = self.open_file(workspace, data.ground.files[0]) score = self.scoring(file, data.ground) diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 434f6dbde..78114c204 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -6,6 +6,7 @@ from agbenchmark.tests.regression.RegressionManager import RegressionManager import requests from requests.exceptions import RequestException from agbenchmark.mocks.MockManager import MockManager +from agbenchmark.challenges.define_task_types import ChallengeData @pytest.fixture(scope="module") @@ -64,21 +65,34 @@ def server_response(request, config): # print(f"Request succeeded with status code {response.status_code}") -regression_txt = "agbenchmark/tests/regression/regression_tests.txt" +regression_json = "agbenchmark/tests/regression/regression_tests.json" -regression_manager = RegressionManager(regression_txt) +regression_manager = RegressionManager(regression_json) + + +# this is to get the challenge_data from every test +@pytest.fixture(autouse=True) +def regression_data(request): + return request.param def pytest_runtest_makereport(item, call): - """Called for each test report. 
Generated for each stage - of a test run (setup, call, teardown).""" if call.when == "call": - if ( - call.excinfo is None - ): # if no error in the call stage, add it as a regression test - regression_manager.add_test(item.nodeid) - else: # otherwise, :( - regression_manager.remove_test(item.nodeid) + challenge_data = item.funcargs.get("regression_data", None) + difficulty = challenge_data.info.difficulty if challenge_data else "unknown" + dependencies = challenge_data.dependencies if challenge_data else [] + + test_details = { + "difficulty": difficulty, + "dependencies": dependencies, + "test": item.nodeid, + } + + print("pytest_runtest_makereport", test_details) + if call.excinfo is None: + regression_manager.add_test(item.nodeid.split("::")[1], test_details) + else: + regression_manager.remove_test(item.nodeid.split("::")[1]) def pytest_collection_modifyitems(items): @@ -86,7 +100,7 @@ def pytest_collection_modifyitems(items): to add regression marker to collected test items.""" for item in items: print("pytest_collection_modifyitems", item.nodeid) - if item.nodeid + "\n" in regression_manager.tests: + if item.nodeid.split("::")[1] in regression_manager.tests: print(regression_manager.tests) item.add_marker(pytest.mark.regression) diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 494a9b071..7d14228c8 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -25,6 +25,11 @@ class TestReadFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) + @pytest.mark.parametrize( + "regression_data", + [data], + indirect=True, + ) @pytest.mark.depends(on=data.dependencies) def test_read_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json index 562d1c364..1d2621081 100644 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -10,7 +10,7 @@ }, "mock_func": "basic_write_file_mock", "info": { - "difficulty": "easy", + "difficulty": "basic", "description": "Tests the writing to file", "side_effects": ["tests if there is in fact an LLM attached"] } diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 0a4ef4a2c..330128898 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -16,6 +16,11 @@ class TestWriteFile(BasicChallenge): [(data.task, data.mock_func)], indirect=True, ) + @pytest.mark.parametrize( + "regression_data", + [data], + indirect=True, + ) @pytest.mark.depends(name="test_write_file") def test_write_file(self, workspace): file = self.open_file(workspace, data.ground.files[0]) diff --git a/agbenchmark/tests/regression/RegressionManager.py b/agbenchmark/tests/regression/RegressionManager.py index 9117d53f1..a1379ecae 100644 --- a/agbenchmark/tests/regression/RegressionManager.py +++ b/agbenchmark/tests/regression/RegressionManager.py @@ -1,3 +1,6 @@ +import json + + class RegressionManager: """Abstracts interaction with the regression tests file""" @@ -6,17 +9,21 @@ class RegressionManager: self.load() def load(self) -> None: - with open(self.filename, "r") as f: - 
self.tests = f.readlines() + try: + with open(self.filename, "r") as f: + self.tests = json.load(f) + except (FileNotFoundError, json.decoder.JSONDecodeError): + self.tests = {} def save(self) -> None: with open(self.filename, "w") as f: - f.writelines(self.tests) + json.dump(self.tests, f, indent=4) - def add_test(self, test_id) -> None: - if f"{test_id}\n" not in self.tests: - self.tests.append(f"{test_id}\n") + def add_test(self, test_name: str, test_details: dict) -> None: + self.tests[test_name] = test_details + self.save() - def remove_test(self, test_id) -> None: - if f"{test_id}\n" in self.tests: - self.tests.remove(f"{test_id}\n") + def remove_test(self, test_name: str) -> None: + if test_name in self.tests: + del self.tests[test_name] + self.save() diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json new file mode 100644 index 000000000..9e26dfeeb --- /dev/null +++ b/agbenchmark/tests/regression/regression_tests.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt index 57b94cd7a..8af722f07 100644 --- a/agbenchmark/tests/regression/regression_tests.txt +++ b/agbenchmark/tests/regression/regression_tests.txt @@ -1,3 +1,14 @@ -agbenchmark/challenges/retrieval/r1/r1_test.py::TestRetrieval1::test_retrieval[server_response0] -agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[server_response0] -agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[server_response0] +{ + "agbenchmark/tests/basic_abilities/write_file/write_file_test.py": { + "difficulty": "easy", + "dependencies": [], + "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[regression_data0-server_response0]" + }, + "agbenchmark/tests/basic_abilities/read_file/read_file_test.py": { + "difficulty": "basic", + "dependencies": [ + "test_write_file" + ], + "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[regression_data0-server_response0]" + } +} \ No newline at end of file -- cgit v1.2.3 From d6a6e69f2e3ed1cd4bb1715ae737ad50d6b17cb9 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 25 Jun 2023 19:30:04 -0400 Subject: can now put file extensions or names in files data --- agbenchmark/Challenge.py | 22 +++++++++++++++++++++- agbenchmark/challenges/retrieval/r1/r1_test.py | 12 +++++++----- .../basic_abilities/read_file/read_file_test.py | 12 +++++++----- .../basic_abilities/write_file/w_file_data.json | 2 +- .../basic_abilities/write_file/write_file_test.py | 12 +++++++----- agbenchmark/tests/regression/regression_tests.json | 15 ++++++++++++++- 6 files changed, 57 insertions(+), 18 deletions(-) diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py index 9828a0e9e..d159296b1 100644 --- a/agbenchmark/Challenge.py +++ b/agbenchmark/Challenge.py @@ -1,5 +1,5 @@ import os -from typing import Optional +import glob from agbenchmark.challenges.define_task_types import Ground @@ -14,6 +14,26 @@ class Challenge: with open(workspace_dir, "r") as f: return f.read() + @staticmethod + def open_files(workspace: str, file_patterns: list): + script_dir = os.path.abspath(workspace) + files_contents = [] + + for file_pattern in file_patterns: + # Check if it is a file extension + if file_pattern.startswith("."): + # Find all files with the given extension in the workspace 
+ matching_files = glob.glob(os.path.join(script_dir, "*" + file_pattern)) + else: + # Otherwise, it is a specific file + matching_files = [os.path.join(script_dir, file_pattern)] + + for file_path in matching_files: + with open(file_path, "r") as f: + files_contents.append(f.read()) + + return files_contents + @staticmethod def write_to_file(workspace: str, filename: str, content: str): script_dir = os.path.abspath(workspace) diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 489d298fb..2a7d92a71 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -23,10 +23,12 @@ class TestRetrieval1(RetrievalChallenge): indirect=True, ) def test_retrieval(self, workspace, current_challenge_data): - file = self.open_file(workspace, data.ground.files[0]) + files_contents = self.open_files(workspace, data.ground.files) - score = self.scoring(file, data.ground) + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, data.ground) + print("Your score is:", score) + scores.append(score) - print("You score is:", score) - - assert score + assert 1 in scores diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 7d14228c8..90946670c 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -32,10 +32,12 @@ class TestReadFile(BasicChallenge): ) @pytest.mark.depends(on=data.dependencies) def test_read_file(self, workspace): - file = self.open_file(workspace, data.ground.files[0]) + files_contents = self.open_files(workspace, data.ground.files) - score = self.scoring(file, data.ground) + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, data.ground) + print("Your score is:", score) + scores.append(score) - print("You score is:", score) - - assert score + assert 1 in scores diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json index 1d2621081..037c5bd88 100644 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -6,7 +6,7 @@ "answer": "Washington", "should_contain": ["Washington"], "should_not_contain": ["New York", "Los Angeles", "San Francisco"], - "files": ["file_to_check.txt"] + "files": [".txt"] }, "mock_func": "basic_write_file_mock", "info": { diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 330128898..187378ff1 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -23,10 +23,12 @@ class TestWriteFile(BasicChallenge): ) @pytest.mark.depends(name="test_write_file") def test_write_file(self, workspace): - file = self.open_file(workspace, data.ground.files[0]) + files_contents = self.open_files(workspace, data.ground.files) - score = self.scoring(file, data.ground) + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, data.ground) + print("Your score is:", score) + scores.append(score) - print("You score is:", score) - - assert score + assert 1 in scores diff --git a/agbenchmark/tests/regression/regression_tests.json 
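The `open_files` helper added above accepts either exact filenames or bare extensions, which is what lets ground-truth entries like `".txt"` match whatever text file the agent happened to create. A usage sketch against a throwaway workspace, assuming the `agbenchmark` package is importable:

```python
import os
import tempfile

from agbenchmark.Challenge import Challenge

workspace = tempfile.mkdtemp()
with open(os.path.join(workspace, "file_to_check.txt"), "w") as f:
    f.write("Washington")

# An exact filename and a bare extension both resolve to the same file here.
print(Challenge.open_files(workspace, ["file_to_check.txt"]))  # ['Washington']
print(Challenge.open_files(workspace, [".txt"]))               # ['Washington']
```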
b/agbenchmark/tests/regression/regression_tests.json index 9e26dfeeb..c84fc9c99 100644 --- a/agbenchmark/tests/regression/regression_tests.json +++ b/agbenchmark/tests/regression/regression_tests.json @@ -1 +1,14 @@ -{} \ No newline at end of file +{ + "TestWriteFile": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[regression_data0-server_response0]" + }, + "TestReadFile": { + "difficulty": "basic", + "dependencies": [ + "test_write_file" + ], + "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[regression_data0-server_response0]" + } +} \ No newline at end of file -- cgit v1.2.3 From fa0df12439b7beea91a46f08e7f6154900dc1047 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Mon, 26 Jun 2023 09:27:20 -0400 Subject: mini agi attempt --- agbenchmark/conftest.py | 46 +++++++++++++--------- agbenchmark/tests/regression/regression_tests.json | 15 +------ agent/agbenchmark_run.py | 27 +++++++++++++ 3 files changed, 55 insertions(+), 33 deletions(-) create mode 100644 agent/agbenchmark_run.py diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 78114c204..b3b69f194 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -7,6 +7,7 @@ import requests from requests.exceptions import RequestException from agbenchmark.mocks.MockManager import MockManager from agbenchmark.challenges.define_task_types import ChallengeData +import subprocess @pytest.fixture(scope="module") @@ -42,27 +43,34 @@ def server_response(request, config): else: task = request.param mock_function_name = None - # print(f"Server starting at {request.module}") - # try: - # response = requests.post( - # f"{config['hostname']}:{config['port']}", data={"task": task} - # ) - # response.raise_for_status() # This will raise an HTTPError if the status is 4xx or 5xx - # except RequestException: - # # If an exception occurs (could be connection, timeout, or HTTP errors), we use the mock - - if mock_function_name: - mock_manager = MockManager( - task - ) # workspace doesn't need to be passed in, stays the same - print("Server unavailable, using mock", mock_function_name) - mock_manager.delegate(mock_function_name) - else: - print("No mock provided") + # get the current file's directory + current_dir = os.path.dirname(os.path.abspath(__file__)) + + # construct the script's path + script_path = os.path.join(current_dir, "..", "agent", "agbenchmark_run.py") + + # form the command + command = ["python", script_path, task] + + # if mock_function_name: + # mock_manager = MockManager( + # task + # ) # workspace doesn't need to be passed in, stays the same + # print("Server unavailable, using mock", mock_function_name) + # mock_manager.delegate(mock_function_name) # else: - # # This code is run if no exception occurred - # print(f"Request succeeded with status code {response.status_code}") + # print("No mock provided") + + try: + # run the command and wait for it to complete + result = subprocess.run( + command, shell=True, check=True, text=True, capture_output=True + ) + return result + except subprocess.CalledProcessError as e: + print(f"Subprocess failed with the following error:\n{e}") + # If the subprocess returns a non-zero exit status regression_json = "agbenchmark/tests/regression/regression_tests.json" diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json index c84fc9c99..9e26dfeeb 100644 --- 
a/agbenchmark/tests/regression/regression_tests.json +++ b/agbenchmark/tests/regression/regression_tests.json @@ -1,14 +1 @@ -{ - "TestWriteFile": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[regression_data0-server_response0]" - }, - "TestReadFile": { - "difficulty": "basic", - "dependencies": [ - "test_write_file" - ], - "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[regression_data0-server_response0]" - } -} \ No newline at end of file +{} \ No newline at end of file diff --git a/agent/agbenchmark_run.py b/agent/agbenchmark_run.py new file mode 100644 index 000000000..f509f5e66 --- /dev/null +++ b/agent/agbenchmark_run.py @@ -0,0 +1,27 @@ +import argparse +import subprocess +import os + + +def main(objective): + # get the current directory + current_dir = os.path.dirname(os.path.abspath(__file__)) + + # form the command + command = ( + f"python {os.path.join(current_dir, 'mini-agi', 'miniagi.py')} {objective}" + ) + + # run the command + subprocess.run(command, shell=True) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run miniagi.py with an objective.") + parser.add_argument( + "objective", type=str, help="The objective to pass to miniagi.py" + ) + + args = parser.parse_args() + + main(args.objective) -- cgit v1.2.3 From f933717d8b6f28e268437e000a57e187076287af Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Tue, 27 Jun 2023 18:17:54 -0400 Subject: mini-agi, simple challenge creation, --mock flag --- .env.example | 4 + README.md | 2 +- agbenchmark/Challenge.py | 53 ++++++++++- agbenchmark/challenges/define_task_types.py | 12 ++- agbenchmark/challenges/retrieval/r1/r1_data.json | 12 ++- agbenchmark/challenges/retrieval/r1/r1_test.py | 24 ++--- agbenchmark/config.json | 2 +- agbenchmark/conftest.py | 103 ++++++++++++++------- agbenchmark/start_benchmark.py | 20 +++- .../tests/basic_abilities/BasicChallenge.py | 2 + .../basic_abilities/read_file/r_file_data.json | 7 +- .../basic_abilities/read_file/read_file_test.py | 39 +++----- .../basic_abilities/write_file/w_file_data.json | 8 +- .../basic_abilities/write_file/write_file_test.py | 26 ++---- agbenchmark/tests/regression/regression_tests.json | 15 ++- agbenchmark/tests/regression/regression_tests.txt | 14 --- agent/agbenchmark_run.py | 27 ------ poetry.lock | 16 +++- pyproject.toml | 3 +- 19 files changed, 233 insertions(+), 156 deletions(-) create mode 100644 .env.example delete mode 100644 agbenchmark/tests/regression/regression_tests.txt delete mode 100644 agent/agbenchmark_run.py diff --git a/.env.example b/.env.example new file mode 100644 index 000000000..0a91118a9 --- /dev/null +++ b/.env.example @@ -0,0 +1,4 @@ +OPENAI_API_KEY= +AGENT_NAME=mini-agi +AGENT_TIMEOUT=60 +MOCK_TEST=False \ No newline at end of file diff --git a/README.md b/README.md index 0ad0cf345..794279478 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ class TestSomething(CategoryChallenge): """Testing if LLM can read a file""" @pytest.mark.parametrize( - "server_response", + "run_agent", [(data.task, data.mock_func)], indirect=True, ) diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py index d159296b1..f644abc4a 100644 --- a/agbenchmark/Challenge.py +++ b/agbenchmark/Challenge.py @@ -1,12 +1,63 @@ import os import glob +import pytest +from abc import ABC, abstractmethod from agbenchmark.challenges.define_task_types import Ground +from 
agbenchmark.challenges.define_task_types import ChallengeData +from dotenv import load_dotenv, set_key +load_dotenv() -class Challenge: +mock_test_str = os.getenv("MOCK_TEST") +MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str else False + + +class Challenge(ABC): """The parent class to all specific challenges classes. Defines helper methods for running a challenge""" + @abstractmethod + def get_file_path(self) -> str: + """This should be implemented by any class which inherits from BasicChallenge""" + pass + + @property + def data(self) -> ChallengeData: + return ChallengeData.deserialize(self.get_file_path()) + + @property + def mock(self): + return self.data.mock.mock_func if self.data.mock else None + + @property + def task(self): + return ( + self.data.mock.mock_task if self.data.mock and MOCK_TEST else self.data.task + ) + + @property + def dependencies(self) -> list: + print("self.data.dependencies", self.data.dependencies) + return self.data.dependencies + + @property + def name(self) -> str: + print("self.data.name", self.data.name) + return self.data.name + + @pytest.mark.parametrize( + "run_agent", + [(task, mock)], + indirect=True, + ) + @pytest.mark.parametrize( + "challenge_data", + [data], + indirect=True, + ) + def test_method(self, workspace): + raise NotImplementedError + @staticmethod def open_file(workspace: str, filename: str): script_dir = os.path.abspath(workspace) diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index 694671218..7fc2361b5 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -4,6 +4,11 @@ import json import os +class Mock(BaseModel): + mock_func: str + mock_task: Optional[str] = None + + class Info(BaseModel): difficulty: str description: str @@ -12,17 +17,18 @@ class Info(BaseModel): class Ground(BaseModel): answer: str - should_contain: Optional[List[str]] - should_not_contain: Optional[List[str]] + should_contain: Optional[List[str]] = None + should_not_contain: Optional[List[str]] = None files: List[str] class ChallengeData(BaseModel): + name: str category: List[str] task: str dependencies: List[str] ground: Ground - mock_func: Optional[str] = None + mock: Optional[Mock] = None info: Info def serialize(self, path: str) -> None: diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index 562d1c364..80c5e51eb 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -1,16 +1,20 @@ { + "name": "retrieval1", "category": ["basic"], - "task": "What is the capital of America?", + "task": "Print the the capital of America to a .txt file", "dependencies": [], "ground": { "answer": "Washington", "should_contain": ["Washington"], "should_not_contain": ["New York", "Los Angeles", "San Francisco"], - "files": ["file_to_check.txt"] + "files": [".txt"] + }, + "mock": { + "mock_func": "basic_write_file_mock", + "mock_task": "What is the capital of America?" 
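With each test now deriving everything from its JSON file through the `data` property, the pydantic models above act as the single schema: `ChallengeData.deserialize` reads a path like `r1_data.json` and validates it. A small sketch of the round trip (field values are illustrative, not a real challenge):

```python
from agbenchmark.challenges.define_task_types import (
    ChallengeData,
    Ground,
    Info,
    Mock,
)

data = ChallengeData(
    name="example_challenge",
    category=["basic"],
    task="Print the capital of America to a .txt file",
    dependencies=[],
    ground=Ground(
        answer="Washington",
        should_contain=["Washington"],
        files=[".txt"],
    ),
    mock=Mock(
        mock_func="basic_write_file_mock",
        mock_task="What is the capital of America?",
    ),
    info=Info(
        difficulty="basic",
        description="Illustrative entry only",
        side_effects=[],
    ),
)

data.serialize("example_data.json")
print(ChallengeData.deserialize("example_data.json").task)
```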
}, - "mock_func": "basic_write_file_mock", "info": { - "difficulty": "easy", + "difficulty": "basic", "description": "Tests the writing to file", "side_effects": ["tests if there is in fact an LLM attached"] } diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 2a7d92a71..0bd907d8a 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -4,30 +4,18 @@ from agbenchmark.challenges.define_task_types import ChallengeData, Ground import os -data = ChallengeData.deserialize( - os.path.join(os.path.dirname(__file__), "r1_data.json") -) - - class TestRetrieval1(RetrievalChallenge): """The first information-retrieval challenge""" - @pytest.mark.parametrize( - "server_response", - [(data.task, data.mock_func)], - indirect=True, - ) - @pytest.mark.parametrize( - "regression_data", - [data], - indirect=True, - ) - def test_retrieval(self, workspace, current_challenge_data): - files_contents = self.open_files(workspace, data.ground.files) + def get_file_path(self) -> str: # all tests must implement this method + return os.path.join(os.path.dirname(__file__), "r1_data.json") + + def test_method(self, workspace): + files_contents = self.open_files(workspace, self.data.ground.files) scores = [] for file_content in files_contents: - score = self.scoring(file_content, data.ground) + score = self.scoring(file_content, self.data.ground) print("Your score is:", score) scores.append(score) diff --git a/agbenchmark/config.json b/agbenchmark/config.json index d285627e5..9e5c1880f 100644 --- a/agbenchmark/config.json +++ b/agbenchmark/config.json @@ -1,5 +1,5 @@ { "hostname": "localhost", "port": 8080, - "workspace": "agbenchmark/mocks/workspace" + "workspace": "C:/Users/silen/miniagi" } diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index b3b69f194..4edd4b5e0 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -4,18 +4,24 @@ import pytest import shutil from agbenchmark.tests.regression.RegressionManager import RegressionManager import requests -from requests.exceptions import RequestException from agbenchmark.mocks.MockManager import MockManager -from agbenchmark.challenges.define_task_types import ChallengeData import subprocess +from agbenchmark.Challenge import Challenge +from dotenv import load_dotenv + +load_dotenv() @pytest.fixture(scope="module") -def config(): +def config(request): config_file = os.path.abspath("agbenchmark/config.json") print(f"Config file: {config_file}") with open(config_file, "r") as f: config = json.load(f) + + if request.config.getoption("--mock"): + config["workspace"] = "agbenchmark/mocks/workspace" + return config @@ -34,43 +40,49 @@ def workspace(config): print(f"Failed to delete {file_path}. 
Reason: {e}") +def pytest_addoption(parser): + parser.addoption("--mock", action="store_true", default=False) + + +AGENT_NAME = os.getenv("AGENT_NAME") +AGENT_TIMEOUT = os.getenv("AGENT_TIMEOUT") + + @pytest.fixture(autouse=True) -def server_response(request, config): +def run_agent(request, config): """Calling to get a response""" if isinstance(request.param, tuple): task = request.param[0] # The task is passed in indirectly - mock_function_name = request.param[1] + mock_function_name = request.param[1] or None else: task = request.param mock_function_name = None - # get the current file's directory - current_dir = os.path.dirname(os.path.abspath(__file__)) - - # construct the script's path - script_path = os.path.join(current_dir, "..", "agent", "agbenchmark_run.py") - - # form the command - command = ["python", script_path, task] + if mock_function_name != None and (request.config.getoption("--mock")): + if mock_function_name: + mock_manager = MockManager( + task + ) # workspace doesn't need to be passed in, stays the same + print("Server unavailable, using mock", mock_function_name) + mock_manager.delegate(mock_function_name) + else: + print("No mock provided") + else: + path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}") - # if mock_function_name: - # mock_manager = MockManager( - # task - # ) # workspace doesn't need to be passed in, stays the same - # print("Server unavailable, using mock", mock_function_name) - # mock_manager.delegate(mock_function_name) - # else: - # print("No mock provided") + try: + timeout = int(AGENT_TIMEOUT) if AGENT_TIMEOUT is not None else 60 - try: - # run the command and wait for it to complete - result = subprocess.run( - command, shell=True, check=True, text=True, capture_output=True - ) - return result - except subprocess.CalledProcessError as e: - print(f"Subprocess failed with the following error:\n{e}") - # If the subprocess returns a non-zero exit status + subprocess.run( + ["python", "miniagi.py", task], + check=True, + cwd=path, + timeout=timeout + # text=True, + # capture_output=True + ) + except subprocess.TimeoutExpired: + print("The subprocess has exceeded the time limit and was terminated.") regression_json = "agbenchmark/tests/regression/regression_tests.json" @@ -80,13 +92,13 @@ regression_manager = RegressionManager(regression_json) # this is to get the challenge_data from every test @pytest.fixture(autouse=True) -def regression_data(request): +def challenge_data(request): return request.param def pytest_runtest_makereport(item, call): if call.when == "call": - challenge_data = item.funcargs.get("regression_data", None) + challenge_data = item.funcargs.get("challenge_data", None) difficulty = challenge_data.info.difficulty if challenge_data else "unknown" dependencies = challenge_data.dependencies if challenge_data else [] @@ -105,9 +117,9 @@ def pytest_runtest_makereport(item, call): def pytest_collection_modifyitems(items): """Called once all test items are collected. 
Used - to add regression marker to collected test items.""" + to add regression and depends markers to collected test items.""" for item in items: - print("pytest_collection_modifyitems", item.nodeid) + # regression add if item.nodeid.split("::")[1] in regression_manager.tests: print(regression_manager.tests) item.add_marker(pytest.mark.regression) @@ -116,3 +128,26 @@ def pytest_collection_modifyitems(items): def pytest_sessionfinish(): """Called at the end of the session to save regression tests""" regression_manager.save() + + +# this is so that all tests can inherit from the Challenge class +def pytest_generate_tests(metafunc): + if "challenge_data" in metafunc.fixturenames: + # Get the instance of the test class + test_class = metafunc.cls() + + # Generate the parameters + params = test_class.data + + # Add the parameters to the test function + metafunc.parametrize("challenge_data", [params], indirect=True) + + if "run_agent" in metafunc.fixturenames: + # Get the instance of the test class + test_class = metafunc.cls() + + # Generate the parameters + params = [(test_class.task, test_class.mock)] + + # Add the parameters to the test function + metafunc.parametrize("run_agent", params, indirect=True) diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index 6adcc09bf..ac612293a 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -2,6 +2,10 @@ import click import pytest import json import os +from pathlib import Path +from dotenv import load_dotenv, set_key + +load_dotenv() @click.group() @@ -12,8 +16,8 @@ def cli(): @cli.command() @click.option("--category", default=None, help="Specific category to run") @click.option("--noreg", is_flag=True, help="Skip regression tests") -def start(category, noreg): - """Start the benchmark tests. If a category flag is is provided, run the categories with that mark.""" +@click.option("--mock", is_flag=True, help="Run with mock") +def start(category, noreg, mock): """Start the benchmark tests. 
If a category flag is provided, run the categories with that mark.""" config_file = "agbenchmark/config.json" @@ -28,7 +32,8 @@ def start(category, noreg): ) config["port"] = click.prompt("Please enter a new port", default=8080) config["workspace"] = click.prompt( - "Please enter a new workspace path", default="agbenchmark/mocks/workspace" + "Please enter a new workspace path", + default=os.path.join(Path.home(), "miniagi"), ) with open(config_dir, "w") as f: @@ -38,13 +43,17 @@ def start(category, noreg): with open(config_dir, "r") as f: config = json.load(f) + set_key(".env", "MOCK_TEST", "True" if mock else "False") + if mock: + config["workspace"] = "agbenchmark/mocks/workspace" + # create workspace directory if it doesn't exist workspace_path = os.path.abspath(config["workspace"]) if not os.path.exists(workspace_path): os.makedirs(workspace_path, exist_ok=True) regression_path = os.path.abspath( - "agbenchmark/tests/regression/regression_tests.txt" + "agbenchmark/tests/regression/regression_tests.json" ) if not os.path.exists(regression_path): with open(regression_path, "a"): @@ -74,6 +83,9 @@ def start(category, noreg): else: print("Running all categorys") # run all categorys + if mock: + pytest_args.append("--mock") + # Run pytest with the constructed arguments pytest.main(pytest_args) diff --git a/agbenchmark/tests/basic_abilities/BasicChallenge.py b/agbenchmark/tests/basic_abilities/BasicChallenge.py index 563207405..6e7f73100 100644 --- a/agbenchmark/tests/basic_abilities/BasicChallenge.py +++ b/agbenchmark/tests/basic_abilities/BasicChallenge.py @@ -1,5 +1,7 @@ import pytest from agbenchmark.Challenge import Challenge +from agbenchmark.challenges.define_task_types import ChallengeData +from abc import abstractmethod @pytest.mark.basic diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index 8c5ef62db..b21e2724b 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -1,13 +1,16 @@ { + "name": "basic_read_file", "category": ["basic"], "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", - "dependencies": ["test_write_file"], + "dependencies": ["basic_write_file"], "ground": { "answer": "random string: this is how we're doing", "should_contain": ["random string: this is how we're doing"], "files": ["file_to_check.txt"] }, - "mock_func": "basic_read_file_mock", + "mock": { + "mock_func": "basic_read_file_mock" + }, "info": { "description": "This reads the file quickly", "difficulty": "basic", diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 90946670c..68288a42c 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -4,39 +4,30 @@ from agbenchmark.Challenge import Challenge from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os -data = ChallengeData.deserialize( - os.path.join(os.path.dirname(__file__), "r_file_data.json") -) +class TestReadFile(BasicChallenge): + """Testing if LLM can read a file""" -@pytest.fixture(scope="module", autouse=True) -def setup_module(workspace): - if data.ground.should_contain: + @pytest.fixture( + scope="module", autouse=True + ) # this is specific to setting up a file for the test, not all tests have this + def 
setup_module(self, workspace): Challenge.write_to_file( - workspace, data.ground.files[0], "this is how we're doing" + workspace, self.data.ground.files[0], "this is how we're doing" ) + def get_file_path(self) -> str: # all tests must implement this method + return os.path.join(os.path.dirname(__file__), "r_file_data.json") -class TestReadFile(BasicChallenge): - """Testing if LLM can read a file""" - - @pytest.mark.parametrize( - "server_response", - [(data.task, data.mock_func)], - indirect=True, - ) - @pytest.mark.parametrize( - "regression_data", - [data], - indirect=True, - ) - @pytest.mark.depends(on=data.dependencies) - def test_read_file(self, workspace): - files_contents = self.open_files(workspace, data.ground.files) + @pytest.mark.depends(on=["basic_write_file"], name="basic_read_file") + def test_method( + self, workspace + ): # run_test is a common name that all tests must implement + files_contents = self.open_files(workspace, self.data.ground.files) scores = [] for file_content in files_contents: - score = self.scoring(file_content, data.ground) + score = self.scoring(file_content, self.data.ground) print("Your score is:", score) scores.append(score) diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json index 037c5bd88..358ebb538 100644 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -1,6 +1,7 @@ { + "name": "basic_write_file", "category": ["basic"], - "task": "What is the capital of America?", + "task": "Print the the capital of America to a .txt file", "dependencies": [], "ground": { "answer": "Washington", @@ -8,7 +9,10 @@ "should_not_contain": ["New York", "Los Angeles", "San Francisco"], "files": [".txt"] }, - "mock_func": "basic_write_file_mock", + "mock": { + "mock_func": "basic_write_file_mock", + "mock_task": "What is the capital of America?" 
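The tests above funnel every candidate file through `self.scoring(file_content, self.data.ground)` and assert that at least one file scores 1. The body of `scoring` is not shown in these patches; based on the `Ground` fields, a plausible reading is an all-or-nothing string check along these lines (an assumption for illustration, not the repo's actual implementation):

```python
from agbenchmark.challenges.define_task_types import Ground


def scoring(content: str, ground: Ground) -> float:
    # Assumed semantics: every required phrase must appear...
    for phrase in ground.should_contain or []:
        if phrase not in content:
            return 0.0
    # ...and no forbidden phrase may appear.
    for phrase in ground.should_not_contain or []:
        if phrase in content:
            return 0.0
    return 1.0


ground = Ground(
    answer="Washington",
    should_contain=["Washington"],
    should_not_contain=["New York"],
    files=[".txt"],
)
print(scoring("Washington is the capital", ground))  # 1.0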
+ }, "info": { "difficulty": "basic", "description": "Tests the writing to file", diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 187378ff1..8caa6605a 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -3,31 +3,21 @@ from agbenchmark.challenges.define_task_types import ChallengeData from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os -data = ChallengeData.deserialize( - os.path.join(os.path.dirname(__file__), "w_file_data.json") -) - class TestWriteFile(BasicChallenge): """Testing if LLM can write to a file""" - @pytest.mark.parametrize( - "server_response", - [(data.task, data.mock_func)], - indirect=True, - ) - @pytest.mark.parametrize( - "regression_data", - [data], - indirect=True, - ) - @pytest.mark.depends(name="test_write_file") - def test_write_file(self, workspace): - files_contents = self.open_files(workspace, data.ground.files) + def get_file_path(self) -> str: # all tests must implement this method + return os.path.join(os.path.dirname(__file__), "w_file_data.json") + + @pytest.mark.depends(on=[], name="basic_write_file") + def test_method(self, workspace): + print("my workspace is ", workspace) + files_contents = self.open_files(workspace, self.data.ground.files) scores = [] for file_content in files_contents: - score = self.scoring(file_content, data.ground) + score = self.scoring(file_content, self.data.ground) print("Your score is:", score) scores.append(score) diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json index 9e26dfeeb..8a6278fea 100644 --- a/agbenchmark/tests/regression/regression_tests.json +++ b/agbenchmark/tests/regression/regression_tests.json @@ -1 +1,14 @@ -{} \ No newline at end of file +{ + "TestWriteFile": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0-run_agent0]" + }, + "TestReadFile": { + "difficulty": "basic", + "dependencies": [ + "basic_write_file" + ], + "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_method[challenge_data0-run_agent0]" + } +} \ No newline at end of file diff --git a/agbenchmark/tests/regression/regression_tests.txt b/agbenchmark/tests/regression/regression_tests.txt deleted file mode 100644 index 8af722f07..000000000 --- a/agbenchmark/tests/regression/regression_tests.txt +++ /dev/null @@ -1,14 +0,0 @@ -{ - "agbenchmark/tests/basic_abilities/write_file/write_file_test.py": { - "difficulty": "easy", - "dependencies": [], - "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file[regression_data0-server_response0]" - }, - "agbenchmark/tests/basic_abilities/read_file/read_file_test.py": { - "difficulty": "basic", - "dependencies": [ - "test_write_file" - ], - "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_read_file[regression_data0-server_response0]" - } -} \ No newline at end of file diff --git a/agent/agbenchmark_run.py b/agent/agbenchmark_run.py deleted file mode 100644 index f509f5e66..000000000 --- a/agent/agbenchmark_run.py +++ /dev/null @@ -1,27 +0,0 @@ -import argparse -import subprocess -import os - - -def main(objective): - # get the current directory - current_dir = 
os.path.dirname(os.path.abspath(__file__)) - - # form the command - command = ( - f"python {os.path.join(current_dir, 'mini-agi', 'miniagi.py')} {objective}" - ) - - # run the command - subprocess.run(command, shell=True) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Run miniagi.py with an objective.") - parser.add_argument( - "objective", type=str, help="The objective to pass to miniagi.py" - ) - - args = parser.parse_args() - - main(args.objective) diff --git a/poetry.lock b/poetry.lock index d7939fbfe..7b2477bc6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -644,6 +644,20 @@ future-fstrings = "*" networkx = "*" pytest = ">=3" +[[package]] +name = "python-dotenv" +version = "1.0.0" +description = "Read key-value pairs from a .env file and set them as environment variables" +optional = false +python-versions = ">=3.8" +files = [ + {file = "python-dotenv-1.0.0.tar.gz", hash = "sha256:a8df96034aae6d2d50a4ebe8216326c61c3eb64836776504fcca410e5937a3ba"}, + {file = "python_dotenv-1.0.0-py3-none-any.whl", hash = "sha256:f5971a9226b701070a4bf2c38c89e5a3f0d64de8debda981d1db98583009122a"}, +] + +[package.extras] +cli = ["click (>=5.0)"] + [[package]] name = "requests" version = "2.31.0" @@ -814,4 +828,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "a03dfa9938e062bdf564b7678df9dc9277c7c8e504f14f98084c5a2d497a8f7c" +content-hash = "f8de5e973c92360108aaca1cecc2fdd505f10a9c2975b46c83ea9c24b4af3cfe" diff --git a/pyproject.toml b/pyproject.toml index 0a4f8ba73..043fe68a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" pytest-depends = "^1.0.1" +python-dotenv = "^1.0.0" [build-system] @@ -30,7 +31,7 @@ testpaths = [ markers = [ "retrieval", "regression", - "basic" + "basic", ] [tool.poetry.scripts] -- cgit v1.2.3 From 76ee994d2c7a205799bc7c07adfa70f0c93102e9 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Tue, 27 Jun 2023 19:19:14 -0400 Subject: read mes, remove port and host from config, etc --- .env.example | 1 - README.md | 158 +++++++-------------- agbenchmark/challenges/README.md | 31 ++-- agbenchmark/config.json | 4 +- agbenchmark/mocks/basic_gpt_agent.py | 20 --- agbenchmark/mocks/tests/basic_mocks.py | 12 +- agbenchmark/start_benchmark.py | 4 - .../basic_abilities/read_file/read_file_test.py | 5 +- .../basic_abilities/write_file/write_file_test.py | 1 - agbenchmark/tests/regression/regression_tests.json | 7 - 10 files changed, 75 insertions(+), 168 deletions(-) delete mode 100644 agbenchmark/mocks/basic_gpt_agent.py diff --git a/.env.example b/.env.example index 0a91118a9..7782d048e 100644 --- a/.env.example +++ b/.env.example @@ -1,4 +1,3 @@ -OPENAI_API_KEY= AGENT_NAME=mini-agi AGENT_TIMEOUT=60 MOCK_TEST=False \ No newline at end of file diff --git a/README.md b/README.md index 794279478..2c8daa0ad 100644 --- a/README.md +++ b/README.md @@ -2,131 +2,94 @@ A repo built for the purpose of benchmarking the performance of agents far and wide, regardless of how they are set up and how they work +## As a user + +1. `pip install auto-gpt-benchmarks` +2. Add boilerplate code to run and kill agent +3. `agbenchmark start` + - `--category challenge_category` to run tests in a specific category + - `--mock` to only run mock tests if they exists for each test + - `--noreg` to skip any tests that have passed in the past. When you run without this flag and a previous challenge that passed fails, it will now not be regression tests +4. 
We call the boilerplate code for your agent
+5. Show pass rate of tests, logs, and any other metrics
+
+## Contributing
+
 ##### Diagrams: https://whimsical.com/agbenchmark-5n4hXBq1ZGzBwRsK4TVY7x
 
-### To run the basic existing mock (June 21)
+### To run the existing mocks
 
 1. clone the repo `auto-gpt-benchmarks`
 2. `pip install poetry`
 3. `poetry shell`
 4. `poetry install`
-5. `agbenchmark start`
+5. `cp .env.example .env`
+6. `agbenchmark start --mock`
 
 Keep config the same and watch the logs :)
 
+### To run with mini-agi
+
+1. Navigate to `auto-gpt-benchmarks/agent/mini-agi`
+2. `pip install -r requirements.txt`
+3. `cp .env.example .env`, set `PROMPT_USER=false` and add your `OPENAI_API_KEY=`. Set `MODEL="gpt-3.5-turbo"` if you don't have access to `gpt-4` yet. Also make sure you have Python 3.10 or higher installed
+4. Make sure to follow the commands above, and remove the mock flag: `agbenchmark start`
+
 - To add requirements, use `poetry add requirement`.
 
 Feel free to create PRs to merge with `main` at will (but also feel free to ask for review) - if you can't, send a message in the R&D chat for access.
 
-If you push at any point and break things - it'll happen to everyone - fix it asap. Step 1 is to revert `main` to last working commit
+If you push at any point and break things - it'll happen to everyone - fix it asap. Step 1 is to revert `master` to the last working commit
 
 Let people know what beautiful code you write does, document everything well
 
 Share your progress :)
 
-## How this works
-
-1. `pip install auto-gpt-benchmarks`
-2. Add boilerplate code to start webserver to your agent (run loop and stop condition)
-3. `agbenchmark start --category challenge_category` remove challenge flag to run all tests. specify config of hostname, port, and workspace directory
-4. We call the server to run the agent for each test
-5. Show pass rate of tests, logs, and any other metrics
-
-### To run the basic existing mock (June 21)
-
-1. clone the repo `auto-gpt-benchmarks`
-2. `pip install poetry`
-3. `poetry shell`
-4. `poetry install`
-5. 
`agbenchmark start` - Keep config the same and watch the logs :) - -#### Bonuses - -- You can adds tests by git cloning auto-gpt-benchmarks to your repo -- Agent is abstracted from benchmark, don't need to do any extra setup other then starting the server -- Simple, easy to use -- Don't have to deal with cloud or parallelization yet - ### Pytest -to create a test: +an example of a test is below, use it as a template and change the class name, the .json name, what the test depends on and it's name, and the scoring logic ```python import pytest -from agbenchmark.challenges.define_task_types import ChallengeData -from ..CategoryChallenge import CategoryChallenge +from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os -data = ChallengeData.deserialize( - os.path.join(os.path.dirname(__file__), "r_file_data.json") -) - -class TestSomething(CategoryChallenge): - """Testing if LLM can read a file""" - - @pytest.mark.parametrize( - "run_agent", - [(data.task, data.mock_func)], - indirect=True, - ) - def test_retrieval( - self, workspace - ): - # scoring logic goes here -``` - -All challenges will inherit from parent class which has the mark - -```python -@pytest.mark.basic -class BasicChallenge(Challenge): - pass -``` - -If you want to add a custom mark to a Challenge, you must specify it before the test definition -```python -@pytest.mark.other_mark -def test_retrieval(self, workspace): -``` +class TestWriteFile(BasicChallenge): + """Testing if LLM can write to a file""" -To add a dependency to a challenge use the following + def get_file_path(self) -> str: # all tests must implement this method + return os.path.join(os.path.dirname(__file__), "w_file_data.json") -```python -# to defining what a test depends on -from pytest_dependency import depends - -def test1(self, request, workspace): - depends(request, data.dependencies) -# for defining a test as a dependency -@pytest.mark.dependency() -def test2 + @pytest.mark.depends(on=[], name="basic_write_file") + def test_method(self, workspace): + # implement scoring logic by looking at workspace ``` -Ordering of challenges needs to be used in combination with the above to make sure it executes afterwards +All challenges will inherit from parent class which has the mark and any specific methods for their category ```python -@pytest.mark.run(order=1) +@pytest.mark.basic +class BasicChallenge(Challenge): + pass ``` To create a file to test a challenge, add this to the challenge file which will create a file before running the server ```python -@pytest.fixture(scope="module", autouse=True) -def setup_module(workspace): - if data.ground.should_contain: +@pytest.fixture( + scope="module", autouse=True + ) # this is specific to setting up a file for the test, not all tests have this + def setup_module(self, workspace): Challenge.write_to_file( - workspace, data.ground.files[0], "this is how we're doing" + workspace, self.data.ground.files[0], "this is how we're doing" ) ``` -## Api - -FastAPI with REST, import requests to call in auto-gpt-benchmarks. Boilerplate code given to agent project to start server +#### The main Challenge class has all the parametrization and loading logic so that all tests can inherit from it. It lives within [this file](https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/blob/master/agbenchmark/Challenge.py) ## Workspace -Defined by the user on config +If `--mock` flag is used it is at `agbenchmark/mocks/workspace`. 
Otherwise for mini-agi it is at `C:/Users//miniagi` - it will be automatically set on config
 
 #### Dataset
 
 Manually created, existing challenges within Auto-GPT, https://osu-nlp-group.github.io/Mind2Web/
 
@@ -138,9 +101,9 @@ Manually created, existing challenges within Auto-GPT, https://osu-nlp-group.git
 |-- auto-gpt-benchmarks/ **main project directory**
 |   |-- metrics.py **combining scores, metrics, final evaluation**
 |   |-- start_benchmark.py **entry point from cli**
-|   |-- conftest.py **shared fixtures across all tests**
-|   |-- Challenge.py **easy challenge creation class?**
-|   |-- config.json **hostname, port, workspace folder**
+|   |-- conftest.py **config, workspace creation + teardown, regression test markers, parameterization**
+|   |-- Challenge.py **easy challenge creation class**
+|   |-- config.json **workspace folder**
 |   |-- challenges/ **challenges across different domains**
 |   |   |-- adaptability/
 |   |   |-- basic_abilities/
 |   |   |-- code/
 |   |   |-- memory/
 |   |   |-- retrieval/
 |   |   |-- web_navigation/
 |   |   |-- writing/
-|   |-- tests/ **challenges across different metrics**
-|   |   |-- basic_abilities/
-|   |   |-- interface/
-|   |-- workspace/ **workspace related func**
-|   |   |-- **init**.py
-|   |   |-- workspace_manager.py **creation, deletion**
+|   |-- tests/
+|   |   |-- basic_abilities/ **every LLM should pass these challenges**
+|   |   |-- regression/ **challenges that already passed**
 ```
-
-### Easy Challenge Creation
-
-tbd, but potentially shared Challenge class that challenges instantiate as challenges need different utils/metrics for eval
-
-#### Written Challenges
-
-For code, writing we can create a reference text and use metrics like METEOR, BERTScore, BARTScore
-
-#### Validators
-
-Designed to handle specific types of output (e.g., text, code, structured data)
-
-#### Logging
-
-Log different requests coming in - write file, change file, etc. Maybe a db in the future for metrics, logs, etc
-
-Later: GitHub Actions integration, OpenAPI?, good versioning and backward compatibility
diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md
index e457b85c4..9e74d19ce 100644
--- a/agbenchmark/challenges/README.md
+++ b/agbenchmark/challenges/README.md
@@ -4,7 +4,8 @@ Input:
 
-- **category** (str[]): Category of the challenge such as 'retrieval', 'comprehension', etc. _this is not currently used. for the future it may be needed_
+- **name** (str): Name of the challenge.
+- **category** (str[]): Category of the challenge such as 'basic', 'retrieval', 'comprehension', etc. _this is not currently used. for the future it may be needed_
 - **task** (str): The task that the agent needs to solve.
 - **dependencies** (str[]): The dependencies that the challenge needs to run. Needs to be the full node to the test function.
 - **ground** (dict): The ground truth.
@@ -12,7 +13,9 @@ Input:
 - **should_contain** (list): The exact strings that are required in the final answer.
 - **should_not_contain** (list): The exact strings that should not be in the final answer.
 - **files** (list): Files that are used for retrieval. Can specify file here or an extension.
-- **mock_func** (str): Function to mock the agent's response. This is used for testing purposes.
+- **mock** (dict): Mock response for testing.
+  - **mock_func** (str): Function to mock the agent's response. This is used for testing purposes.
+  - **mock_task** (str): Task to provide for the mock function.
 - **info** (dict): Additional info about the challenge.
 - **difficulty** (str): The difficulty of this query.
 - **description** (str): Description of the challenge. 
@@ -22,24 +25,26 @@ Example: ```python { + "name": "basic_write_file", "category": ["basic"], - "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", - "dependencies": [ - "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_write_file" - ], + "task": "Print the the capital of America to a .txt file", + "dependencies": [], "ground": { - "answer": "random string: this is how we're doing", - "should_contain": ["random string: this is how we're doing"], - "files": ["file_to_check.txt"] + "answer": "Washington", + "should_contain": ["Washington"], + "should_not_contain": ["New York", "Los Angeles", "San Francisco"], + "files": [".txt"] + }, + "mock": { + "mock_func": "basic_write_file_mock", + "mock_task": "What is the capital of America?" }, - "mock_func": "basic_read_file_mock", "info": { - "description": "This reads the file quickly", "difficulty": "basic", - "side_effects": [""] + "description": "Tests the writing to file", + "side_effects": ["tests if there is in fact an LLM attached"] } } - ``` Current Output: diff --git a/agbenchmark/config.json b/agbenchmark/config.json index 9e5c1880f..3de1dd643 100644 --- a/agbenchmark/config.json +++ b/agbenchmark/config.json @@ -1,5 +1,3 @@ { - "hostname": "localhost", - "port": 8080, - "workspace": "C:/Users/silen/miniagi" + "hostname": "localhost" } diff --git a/agbenchmark/mocks/basic_gpt_agent.py b/agbenchmark/mocks/basic_gpt_agent.py deleted file mode 100644 index 6aac3d191..000000000 --- a/agbenchmark/mocks/basic_gpt_agent.py +++ /dev/null @@ -1,20 +0,0 @@ -import json -import openai - - -def basic_gpt_agent(query) -> str: - response = openai.ChatCompletion.create( - model="gpt-3.5-turbo-0613", messages=[{"role": "user", "content": query}] - ) - - answer = response["choices"][0]["message"]["content"] # type: ignore - - print("QUERY : ", query) - print("AGENT ANSWER: ", answer) - - return answer - - -if __name__ == "__main__": - # server boilerplate example here - basic_gpt_agent("") diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index 550095b72..631b30c2c 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -1,5 +1,4 @@ from agbenchmark.Challenge import Challenge -from ..basic_gpt_agent import basic_gpt_agent def basic_read_file_mock(task: str, workspace: str): @@ -18,9 +17,8 @@ def basic_write_file_mock(task: str, workspace: str): """ This mock writes to a file (creates one if it doesn't exist) """ - - # Call the basic_gpt_agent to get a response. - response = basic_gpt_agent(task) - - # Open the file in write mode. 
- Challenge.write_to_file(workspace, "file_to_check.txt", response) + Challenge.write_to_file( + workspace, + "file_to_check.txt", + "Washington DC is the capital of the United States of America", + ) diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index ac612293a..c9f3643cc 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -27,10 +27,6 @@ def start(category, noreg, mock): if not os.path.exists(config_dir) or os.stat(config_dir).st_size == 0: config = {} - config["hostname"] = click.prompt( - "\nPlease enter a new hostname", default="localhost" - ) - config["port"] = click.prompt("Please enter a new port", default=8080) config["workspace"] = click.prompt( "Please enter a new workspace path", default=os.path.join(Path.home(), "miniagi"), diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 68288a42c..f99ae608c 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -1,5 +1,4 @@ import pytest -from agbenchmark.challenges.define_task_types import ChallengeData from agbenchmark.Challenge import Challenge from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os @@ -8,9 +7,7 @@ import os class TestReadFile(BasicChallenge): """Testing if LLM can read a file""" - @pytest.fixture( - scope="module", autouse=True - ) # this is specific to setting up a file for the test, not all tests have this + @pytest.fixture(scope="module", autouse=True) def setup_module(self, workspace): Challenge.write_to_file( workspace, self.data.ground.files[0], "this is how we're doing" diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 8caa6605a..39c73b163 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -1,5 +1,4 @@ import pytest -from agbenchmark.challenges.define_task_types import ChallengeData from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json index 8a6278fea..384f9e7c6 100644 --- a/agbenchmark/tests/regression/regression_tests.json +++ b/agbenchmark/tests/regression/regression_tests.json @@ -3,12 +3,5 @@ "difficulty": "basic", "dependencies": [], "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0-run_agent0]" - }, - "TestReadFile": { - "difficulty": "basic", - "dependencies": [ - "basic_write_file" - ], - "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_method[challenge_data0-run_agent0]" } } \ No newline at end of file -- cgit v1.2.3 From 0c81585a538facff2b62c22d5b896df00cea9c17 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Tue, 27 Jun 2023 22:17:42 -0400 Subject: Update README.md (#41) --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 2c8daa0ad..504132ddb 100644 --- a/README.md +++ b/README.md @@ -73,8 +73,7 @@ class BasicChallenge(Challenge): pass ``` -To create a file to test a challenge, add this to the challenge file which will create a file before running the server - +Add the below to create a file in the workspace prior to 
running a challenge. Only use when a file is needed to be created in the workspace prior to a test, such as with the read_file_test. ```python @pytest.fixture( scope="module", autouse=True -- cgit v1.2.3 From ac5af736963dac95969f0cb3d0f99480a0a4f401 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Wed, 28 Jun 2023 21:28:46 -0400 Subject: trying to get kill process --- agbenchmark/config.json | 8 +- agbenchmark/conftest.py | 70 ++++++++++++---- agbenchmark/tests/regression/regression_tests.json | 7 ++ poetry.lock | 93 +++++++++++++++++++++- pyproject.toml | 2 + 5 files changed, 161 insertions(+), 19 deletions(-) diff --git a/agbenchmark/config.json b/agbenchmark/config.json index 3de1dd643..d95b8e443 100644 --- a/agbenchmark/config.json +++ b/agbenchmark/config.json @@ -1,3 +1,9 @@ { - "hostname": "localhost" + "workspace": "C:\\Users\\silen\\miniagi", + "cutoff": { + "type": "time", + "user_prompt": "Press enter to continue or abort this action by typing feedback:", + "user_input": "\n", + "count": 5 + } } diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 4edd4b5e0..2590ce781 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -2,11 +2,10 @@ import json import os import pytest import shutil +import subprocess +import sys from agbenchmark.tests.regression.RegressionManager import RegressionManager -import requests from agbenchmark.mocks.MockManager import MockManager -import subprocess -from agbenchmark.Challenge import Challenge from dotenv import load_dotenv load_dotenv() @@ -44,8 +43,16 @@ def pytest_addoption(parser): parser.addoption("--mock", action="store_true", default=False) +def check_cycle_count(cycle_count: int, cutoff: int, proc): + """Increment, print, and check cycle count.""" + cycle_count += 1 + print(f"Cycle count: {cycle_count}") + if cycle_count >= cutoff: + proc.terminate(force=True) + return cycle_count + + AGENT_NAME = os.getenv("AGENT_NAME") -AGENT_TIMEOUT = os.getenv("AGENT_TIMEOUT") @pytest.fixture(autouse=True) @@ -70,19 +77,48 @@ def run_agent(request, config): else: path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}") - try: - timeout = int(AGENT_TIMEOUT) if AGENT_TIMEOUT is not None else 60 - - subprocess.run( - ["python", "miniagi.py", task], - check=True, - cwd=path, - timeout=timeout - # text=True, - # capture_output=True - ) - except subprocess.TimeoutExpired: - print("The subprocess has exceeded the time limit and was terminated.") + timeout = sys.maxsize + + if config["cutoff"]["type"] == "time": + timeout = config["cutoff"]["count"] or 60 + + from pexpect.popen_spawn import PopenSpawn + + print(f"Running {task} with timeout {timeout}") + + # Starting the subprocess using pexpect + proc = PopenSpawn("python", ["miniagi.py", task], timeout=timeout, cwd=path) + + print("proc", proc) + + cycle_count = 0 + + while True: + try: + # If we get the prompt for user input, we send "\n" + if config["cutoff"]["type"] == "user_input": + proc.expect([config["cutoff"]["user_prompt"]]) + proc.sendline(config["cutoff"]["user_input"]) + cycle_count = check_cycle_count( + cycle_count, config["cutoff"]["count"], proc + ) + elif config["cutoff"]["type"] == "cycle_count": + match = proc.expect([r"Cycle count: (\d+)"]) + if match is not None: + cycle_count = int(match.group(1)) # type: ignore + cycle_count = check_cycle_count( + cycle_count, config["cutoff"]["count"], proc + ) + + # for cutoff type "time", just let it run until timeout + except expect.TIMEOUT: + print("The subprocess has exceeded the time limit and was terminated.") + 
break + except expect.EOF: + print("The subprocess has finished running.") + break + + proc.close() regression_json = "agbenchmark/tests/regression/regression_tests.json" diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json index 384f9e7c6..8a6278fea 100644 --- a/agbenchmark/tests/regression/regression_tests.json +++ b/agbenchmark/tests/regression/regression_tests.json @@ -3,5 +3,12 @@ "difficulty": "basic", "dependencies": [], "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0-run_agent0]" + }, + "TestReadFile": { + "difficulty": "basic", + "dependencies": [ + "basic_write_file" + ], + "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_method[challenge_data0-run_agent0]" } } \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index 7b2477bc6..a460f988d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -538,6 +538,20 @@ files = [ {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, ] +[[package]] +name = "pexpect" +version = "4.8.0" +description = "Pexpect allows easy control of interactive console applications." +optional = false +python-versions = "*" +files = [ + {file = "pexpect-4.8.0-py2.py3-none-any.whl", hash = "sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937"}, + {file = "pexpect-4.8.0.tar.gz", hash = "sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c"}, +] + +[package.dependencies] +ptyprocess = ">=0.5" + [[package]] name = "pluggy" version = "1.0.0" @@ -553,6 +567,43 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "psutil" +version = "5.9.5" +description = "Cross-platform lib for process and system monitoring in Python." 
+optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "psutil-5.9.5-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:be8929ce4313f9f8146caad4272f6abb8bf99fc6cf59344a3167ecd74f4f203f"}, + {file = "psutil-5.9.5-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:ab8ed1a1d77c95453db1ae00a3f9c50227ebd955437bcf2a574ba8adbf6a74d5"}, + {file = "psutil-5.9.5-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:4aef137f3345082a3d3232187aeb4ac4ef959ba3d7c10c33dd73763fbc063da4"}, + {file = "psutil-5.9.5-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:ea8518d152174e1249c4f2a1c89e3e6065941df2fa13a1ab45327716a23c2b48"}, + {file = "psutil-5.9.5-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:acf2aef9391710afded549ff602b5887d7a2349831ae4c26be7c807c0a39fac4"}, + {file = "psutil-5.9.5-cp27-none-win32.whl", hash = "sha256:5b9b8cb93f507e8dbaf22af6a2fd0ccbe8244bf30b1baad6b3954e935157ae3f"}, + {file = "psutil-5.9.5-cp27-none-win_amd64.whl", hash = "sha256:8c5f7c5a052d1d567db4ddd231a9d27a74e8e4a9c3f44b1032762bd7b9fdcd42"}, + {file = "psutil-5.9.5-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:3c6f686f4225553615612f6d9bc21f1c0e305f75d7d8454f9b46e901778e7217"}, + {file = "psutil-5.9.5-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7a7dd9997128a0d928ed4fb2c2d57e5102bb6089027939f3b722f3a210f9a8da"}, + {file = "psutil-5.9.5-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89518112647f1276b03ca97b65cc7f64ca587b1eb0278383017c2a0dcc26cbe4"}, + {file = "psutil-5.9.5-cp36-abi3-win32.whl", hash = "sha256:104a5cc0e31baa2bcf67900be36acde157756b9c44017b86b2c049f11957887d"}, + {file = "psutil-5.9.5-cp36-abi3-win_amd64.whl", hash = "sha256:b258c0c1c9d145a1d5ceffab1134441c4c5113b2417fafff7315a917a026c3c9"}, + {file = "psutil-5.9.5-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:c607bb3b57dc779d55e1554846352b4e358c10fff3abf3514a7a6601beebdb30"}, + {file = "psutil-5.9.5.tar.gz", hash = "sha256:5410638e4df39c54d957fc51ce03048acd8e6d60abc0f5107af51e5fb566eb3c"}, +] + +[package.extras] +test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] + +[[package]] +name = "ptyprocess" +version = "0.7.0" +description = "Run a subprocess in a pseudo terminal" +optional = false +python-versions = "*" +files = [ + {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, + {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"}, +] + [[package]] name = "pydantic" version = "1.10.9" @@ -658,6 +709,29 @@ files = [ [package.extras] cli = ["click (>=5.0)"] +[[package]] +name = "pywin32" +version = "306" +description = "Python for Window Extensions" +optional = false +python-versions = "*" +files = [ + {file = "pywin32-306-cp310-cp310-win32.whl", hash = "sha256:06d3420a5155ba65f0b72f2699b5bacf3109f36acbe8923765c22938a69dfc8d"}, + {file = "pywin32-306-cp310-cp310-win_amd64.whl", hash = "sha256:84f4471dbca1887ea3803d8848a1616429ac94a4a8d05f4bc9c5dcfd42ca99c8"}, + {file = "pywin32-306-cp311-cp311-win32.whl", hash = "sha256:e65028133d15b64d2ed8f06dd9fbc268352478d4f9289e69c190ecd6818b6407"}, + {file = "pywin32-306-cp311-cp311-win_amd64.whl", hash = "sha256:a7639f51c184c0272e93f244eb24dafca9b1855707d94c192d4a0b4c01e1100e"}, + {file = "pywin32-306-cp311-cp311-win_arm64.whl", hash = 
"sha256:70dba0c913d19f942a2db25217d9a1b726c278f483a919f1abfed79c9cf64d3a"}, + {file = "pywin32-306-cp312-cp312-win32.whl", hash = "sha256:383229d515657f4e3ed1343da8be101000562bf514591ff383ae940cad65458b"}, + {file = "pywin32-306-cp312-cp312-win_amd64.whl", hash = "sha256:37257794c1ad39ee9be652da0462dc2e394c8159dfd913a8a4e8eb6fd346da0e"}, + {file = "pywin32-306-cp312-cp312-win_arm64.whl", hash = "sha256:5821ec52f6d321aa59e2db7e0a35b997de60c201943557d108af9d4ae1ec7040"}, + {file = "pywin32-306-cp37-cp37m-win32.whl", hash = "sha256:1c73ea9a0d2283d889001998059f5eaaba3b6238f767c9cf2833b13e6a685f65"}, + {file = "pywin32-306-cp37-cp37m-win_amd64.whl", hash = "sha256:72c5f621542d7bdd4fdb716227be0dd3f8565c11b280be6315b06ace35487d36"}, + {file = "pywin32-306-cp38-cp38-win32.whl", hash = "sha256:e4c092e2589b5cf0d365849e73e02c391c1349958c5ac3e9d5ccb9a28e017b3a"}, + {file = "pywin32-306-cp38-cp38-win_amd64.whl", hash = "sha256:e8ac1ae3601bee6ca9f7cb4b5363bf1c0badb935ef243c4733ff9a393b1690c0"}, + {file = "pywin32-306-cp39-cp39-win32.whl", hash = "sha256:e25fd5b485b55ac9c057f67d94bc203f3f6595078d1fb3b458c9c28b7153a802"}, + {file = "pywin32-306-cp39-cp39-win_amd64.whl", hash = "sha256:39b61c15272833b5c329a2989999dcae836b1eed650252ab1b7bfbe1d59f30f4"}, +] + [[package]] name = "requests" version = "2.31.0" @@ -738,6 +812,23 @@ secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17. socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] +[[package]] +name = "wexpect" +version = "4.0.0" +description = "Windows alternative of pexpect" +optional = false +python-versions = "*" +files = [ + {file = "wexpect-4.0.0.tar.gz", hash = "sha256:de9e739e78ec4d74a39bf8499904dacb6c594007a674fb7e10752c9b131f6522"}, +] + +[package.dependencies] +psutil = ">=5.0.0" +pywin32 = ">=220" + +[package.extras] +test = ["codecov", "coverage", "pyinstaller", "setuptools (>=38.0)", "tox", "twine"] + [[package]] name = "yarl" version = "1.9.2" @@ -828,4 +919,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "f8de5e973c92360108aaca1cecc2fdd505f10a9c2975b46c83ea9c24b4af3cfe" +content-hash = "8ab722acade739b9fb841ecae3b8cabd4f1d8a355864573a93d9faa11dcffb90" diff --git a/pyproject.toml b/pyproject.toml index 043fe68a2..af9688d14 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,8 @@ openai = "^0.27.8" pydantic = "^1.10.9" pytest-depends = "^1.0.1" python-dotenv = "^1.0.0" +pexpect = "^4.8.0" +wexpect = "^4.0.0" [build-system] -- cgit v1.2.3 From fce421fb335107cddd9fd60b32e91902be7b5eae Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Thu, 29 Jun 2023 20:51:23 -0400 Subject: moving logic to benchmark.py file --- agbenchmark/benchmark.py | 65 ++++++++++++++++++++++++++++++++++++++++++++++++ agbenchmark/conftest.py | 61 ++------------------------------------------- 2 files changed, 67 insertions(+), 59 deletions(-) create mode 100644 agbenchmark/benchmark.py diff --git a/agbenchmark/benchmark.py b/agbenchmark/benchmark.py new file mode 100644 index 000000000..6dc3b2312 --- /dev/null +++ b/agbenchmark/benchmark.py @@ -0,0 +1,65 @@ +import os +import sys +import pexpect as expect +from dotenv import load_dotenv + +load_dotenv() + + +def check_cycle_count(cycle_count: int, cutoff: int, proc): + """Increment, print, and check cycle count.""" + cycle_count += 1 + print(f"Cycle count: {cycle_count}") + if cycle_count >= cutoff: + proc.terminate(force=True) + return cycle_count + + +AGENT_NAME = os.getenv("AGENT_NAME") + + +def 
run_agnostic(config, task): + path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}") + + timeout = sys.maxsize + + if config["cutoff"]["type"] == "time": + timeout = config["cutoff"]["count"] or 60 + + # from pexpect.popen_spawn import PopenSpawn + + print(f"Running {task} with timeout {timeout}") + + # Starting the subprocess using pexpect + proc = expect.spawn("python", ["miniagi.py", task], timeout=timeout, cwd=path) + + print("proc", proc) + + cycle_count = 0 + + while True: + try: + # If we get the prompt for user input, we send "\n" + if config["cutoff"]["type"] == "user_input": + proc.expect([config["cutoff"]["user_prompt"]]) + proc.sendline(config["cutoff"]["user_input"]) + cycle_count = check_cycle_count( + cycle_count, config["cutoff"]["count"], proc + ) + elif config["cutoff"]["type"] == "cycle_count": + match = proc.expect([r"Cycle count: (\d+)"]) + if match is not None: + cycle_count = int(match.group(1)) # type: ignore + cycle_count = check_cycle_count( + cycle_count, config["cutoff"]["count"], proc + ) + + # for cutoff type "time", just let it run until timeout + except expect.TIMEOUT: + print("The subprocess has exceeded the time limit and was terminated.") + break + except expect.EOF: + print("The subprocess has finished running.") + break + + proc.close() diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 2590ce781..25510e42b 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -6,9 +6,7 @@ import subprocess import sys from agbenchmark.tests.regression.RegressionManager import RegressionManager from agbenchmark.mocks.MockManager import MockManager -from dotenv import load_dotenv - -load_dotenv() +from agbenchmark.benchmark import run_agnostic @pytest.fixture(scope="module") @@ -43,18 +41,6 @@ def pytest_addoption(parser): parser.addoption("--mock", action="store_true", default=False) -def check_cycle_count(cycle_count: int, cutoff: int, proc): - """Increment, print, and check cycle count.""" - cycle_count += 1 - print(f"Cycle count: {cycle_count}") - if cycle_count >= cutoff: - proc.terminate(force=True) - return cycle_count - - -AGENT_NAME = os.getenv("AGENT_NAME") - - @pytest.fixture(autouse=True) def run_agent(request, config): """Calling to get a response""" @@ -75,50 +61,7 @@ def run_agent(request, config): else: print("No mock provided") else: - path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}") - - timeout = sys.maxsize - - if config["cutoff"]["type"] == "time": - timeout = config["cutoff"]["count"] or 60 - - from pexpect.popen_spawn import PopenSpawn - - print(f"Running {task} with timeout {timeout}") - - # Starting the subprocess using pexpect - proc = PopenSpawn("python", ["miniagi.py", task], timeout=timeout, cwd=path) - - print("proc", proc) - - cycle_count = 0 - - while True: - try: - # If we get the prompt for user input, we send "\n" - if config["cutoff"]["type"] == "user_input": - proc.expect([config["cutoff"]["user_prompt"]]) - proc.sendline(config["cutoff"]["user_input"]) - cycle_count = check_cycle_count( - cycle_count, config["cutoff"]["count"], proc - ) - elif config["cutoff"]["type"] == "cycle_count": - match = proc.expect([r"Cycle count: (\d+)"]) - if match is not None: - cycle_count = int(match.group(1)) # type: ignore - cycle_count = check_cycle_count( - cycle_count, config["cutoff"]["count"], proc - ) - - # for cutoff type "time", just let it run until timeout - except expect.TIMEOUT: - print("The subprocess has exceeded the time limit and was terminated.") - break - except expect.EOF: - print("The 
subprocess has finished running.") - break - - proc.close() + run_agnostic(config, task) regression_json = "agbenchmark/tests/regression/regression_tests.json" -- cgit v1.2.3 From 2987d71264c7ffb0b6184e28e17c503aef5b4681 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Fri, 30 Jun 2023 10:50:54 -0400 Subject: moving run agent to tests & agnostic run working --- .env.example | 2 +- agbenchmark/Challenge.py | 16 +-- agbenchmark/agent_interface.py | 108 +++++++++++++++++++++ agbenchmark/benchmark.py | 65 ------------- agbenchmark/challenges/retrieval/r1/r1_test.py | 7 +- agbenchmark/config.json | 9 +- agbenchmark/conftest.py | 37 ------- agbenchmark/mocks/workspace/file_to_check.txt | 1 + .../basic_abilities/read_file/read_file_test.py | 7 +- .../basic_abilities/write_file/write_file_test.py | 6 +- agbenchmark/tests/regression/regression_tests.json | 9 +- agent/hook.py | 10 ++ pyproject.toml | 2 - 13 files changed, 144 insertions(+), 135 deletions(-) create mode 100644 agbenchmark/agent_interface.py delete mode 100644 agbenchmark/benchmark.py create mode 100644 agbenchmark/mocks/workspace/file_to_check.txt create mode 100644 agent/hook.py diff --git a/.env.example b/.env.example index 7782d048e..e50ed58a5 100644 --- a/.env.example +++ b/.env.example @@ -1,3 +1,3 @@ AGENT_NAME=mini-agi -AGENT_TIMEOUT=60 +ENVIRONMENT=local MOCK_TEST=False \ No newline at end of file diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py index f644abc4a..7b1e4df04 100644 --- a/agbenchmark/Challenge.py +++ b/agbenchmark/Challenge.py @@ -4,7 +4,7 @@ import pytest from abc import ABC, abstractmethod from agbenchmark.challenges.define_task_types import Ground from agbenchmark.challenges.define_task_types import ChallengeData -from dotenv import load_dotenv, set_key +from dotenv import load_dotenv load_dotenv() @@ -40,22 +40,24 @@ class Challenge(ABC): print("self.data.dependencies", self.data.dependencies) return self.data.dependencies + def setup_challenge(self, config): + from agbenchmark.agent_interface import run_agent + + print("SETTING UP CHALLENGE...") + + run_agent(self.task, self.mock, config) + @property def name(self) -> str: print("self.data.name", self.data.name) return self.data.name - @pytest.mark.parametrize( - "run_agent", - [(task, mock)], - indirect=True, - ) @pytest.mark.parametrize( "challenge_data", [data], indirect=True, ) - def test_method(self, workspace): + def test_method(self, config): raise NotImplementedError @staticmethod diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py new file mode 100644 index 000000000..eba26fc18 --- /dev/null +++ b/agbenchmark/agent_interface.py @@ -0,0 +1,108 @@ +import os +import sys +import subprocess +import time +from agbenchmark.mocks.MockManager import MockManager +from multiprocessing import Process, Pipe + +from agent.hook import run_specific_agent + +from dotenv import load_dotenv + +load_dotenv() + +MOCK_FLAG = os.getenv("MOCK_TEST") + + +def run_agent(task, mock_func, config): + """Calling to get a response""" + + if mock_func == None and MOCK_FLAG == "True": + print("No mock provided") + elif MOCK_FLAG == "True": + mock_manager = MockManager( + task + ) # workspace doesn't need to be passed in, stays the same + print("Server unavailable, using mock", mock_func) + mock_manager.delegate(mock_func) + else: + if config["agent"]["type"] == "python": + run_agent_function(config, task) + elif config["agent"]["type"] == "script": + run_agent_command(config, task) + + +ENVIRONMENT = os.getenv("ENVIRONMENT") or 
"production" + + +def run_agent_command(config, task): + path = config["agent"]["path"] + + if ENVIRONMENT == "local": + AGENT_NAME = os.getenv("AGENT_NAME") + path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}") + + timeout = config["agent"]["cutoff"] or sys.maxsize + print(f"Running {task} with timeout {timeout}") + + command_from_config = config["agent"]["script"] + command_list = command_from_config.split() + + # replace '{}' with the task + command_list = [cmd if cmd != "{}" else task for cmd in command_list] + print("path, command_list", path, command_list) + start_time = time.time() + proc = subprocess.Popen( + command_list, + cwd=path, + shell=True, + ) + + while True: + if time.time() - start_time > timeout: + print("The subprocess has exceeded the time limit and was terminated.") + proc.terminate() + break + + if proc.poll() is not None: + print("The subprocess has finished running.") + break + + +def run_agent_function(config, task): + timeout = ( + config["cutoff"]["count"] if config["cutoff"]["type"] == "time" else sys.maxsize + ) + print( + f"Running Python function '{config['agent']['function']}' with timeout {timeout}" + ) + + parent_conn, child_conn = Pipe() + process = Process(target=run_specific_agent, args=(task, child_conn)) + process.start() + start_time = time.time() + + while True: + if parent_conn.poll(): # Check if there's a new message from the child process + response, cycle_count = parent_conn.recv() + print(f"Cycle {cycle_count}: {response}") + + if cycle_count >= config["cutoff"]["count"]: + print( + f"Cycle count has reached the limit of {config['cutoff']['count']}. Terminating." + ) + child_conn.send("terminate") + break + + if time.time() - start_time > timeout: + print("The Python function has exceeded the time limit and was terminated.") + child_conn.send( + "terminate" + ) # Send a termination signal to the child process + break + + if not process.is_alive(): + print("The Python function has finished running.") + break + + process.join() diff --git a/agbenchmark/benchmark.py b/agbenchmark/benchmark.py deleted file mode 100644 index 6dc3b2312..000000000 --- a/agbenchmark/benchmark.py +++ /dev/null @@ -1,65 +0,0 @@ -import os -import sys -import pexpect as expect -from dotenv import load_dotenv - -load_dotenv() - - -def check_cycle_count(cycle_count: int, cutoff: int, proc): - """Increment, print, and check cycle count.""" - cycle_count += 1 - print(f"Cycle count: {cycle_count}") - if cycle_count >= cutoff: - proc.terminate(force=True) - return cycle_count - - -AGENT_NAME = os.getenv("AGENT_NAME") - - -def run_agnostic(config, task): - path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}") - - timeout = sys.maxsize - - if config["cutoff"]["type"] == "time": - timeout = config["cutoff"]["count"] or 60 - - # from pexpect.popen_spawn import PopenSpawn - - print(f"Running {task} with timeout {timeout}") - - # Starting the subprocess using pexpect - proc = expect.spawn("python", ["miniagi.py", task], timeout=timeout, cwd=path) - - print("proc", proc) - - cycle_count = 0 - - while True: - try: - # If we get the prompt for user input, we send "\n" - if config["cutoff"]["type"] == "user_input": - proc.expect([config["cutoff"]["user_prompt"]]) - proc.sendline(config["cutoff"]["user_input"]) - cycle_count = check_cycle_count( - cycle_count, config["cutoff"]["count"], proc - ) - elif config["cutoff"]["type"] == "cycle_count": - match = proc.expect([r"Cycle count: (\d+)"]) - if match is not None: - cycle_count = int(match.group(1)) # type: ignore - cycle_count 
= check_cycle_count( - cycle_count, config["cutoff"]["count"], proc - ) - - # for cutoff type "time", just let it run until timeout - except expect.TIMEOUT: - print("The subprocess has exceeded the time limit and was terminated.") - break - except expect.EOF: - print("The subprocess has finished running.") - break - - proc.close() diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 0bd907d8a..b679a731d 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -1,6 +1,4 @@ -import pytest from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge -from agbenchmark.challenges.define_task_types import ChallengeData, Ground import os @@ -10,8 +8,9 @@ class TestRetrieval1(RetrievalChallenge): def get_file_path(self) -> str: # all tests must implement this method return os.path.join(os.path.dirname(__file__), "r1_data.json") - def test_method(self, workspace): - files_contents = self.open_files(workspace, self.data.ground.files) + def test_method(self, config): + self.setup_challenge(config) + files_contents = self.open_files(config["workspace"], self.data.ground.files) scores = [] for file_content in files_contents: diff --git a/agbenchmark/config.json b/agbenchmark/config.json index d95b8e443..7388085dc 100644 --- a/agbenchmark/config.json +++ b/agbenchmark/config.json @@ -1,9 +1,10 @@ { "workspace": "C:\\Users\\silen\\miniagi", - "cutoff": { - "type": "time", - "user_prompt": "Press enter to continue or abort this action by typing feedback:", + "agent": { + "type": "script", + "path": "", + "script": "python miniagi.py {}", "user_input": "\n", - "count": 5 + "cutoff": 60 } } diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 25510e42b..0f1fc7bb2 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -2,11 +2,7 @@ import json import os import pytest import shutil -import subprocess -import sys from agbenchmark.tests.regression.RegressionManager import RegressionManager -from agbenchmark.mocks.MockManager import MockManager -from agbenchmark.benchmark import run_agnostic @pytest.fixture(scope="module") @@ -41,29 +37,6 @@ def pytest_addoption(parser): parser.addoption("--mock", action="store_true", default=False) -@pytest.fixture(autouse=True) -def run_agent(request, config): - """Calling to get a response""" - if isinstance(request.param, tuple): - task = request.param[0] # The task is passed in indirectly - mock_function_name = request.param[1] or None - else: - task = request.param - mock_function_name = None - - if mock_function_name != None and (request.config.getoption("--mock")): - if mock_function_name: - mock_manager = MockManager( - task - ) # workspace doesn't need to be passed in, stays the same - print("Server unavailable, using mock", mock_function_name) - mock_manager.delegate(mock_function_name) - else: - print("No mock provided") - else: - run_agnostic(config, task) - - regression_json = "agbenchmark/tests/regression/regression_tests.json" regression_manager = RegressionManager(regression_json) @@ -120,13 +93,3 @@ def pytest_generate_tests(metafunc): # Add the parameters to the test function metafunc.parametrize("challenge_data", [params], indirect=True) - - if "run_agent" in metafunc.fixturenames: - # Get the instance of the test class - test_class = metafunc.cls() - - # Generate the parameters - params = [(test_class.task, test_class.mock)] - - # Add the parameters to the test function - 
metafunc.parametrize("run_agent", params, indirect=True) diff --git a/agbenchmark/mocks/workspace/file_to_check.txt b/agbenchmark/mocks/workspace/file_to_check.txt new file mode 100644 index 000000000..48dc8cff1 --- /dev/null +++ b/agbenchmark/mocks/workspace/file_to_check.txt @@ -0,0 +1 @@ +Washington DC is the capital of the United States of America \ No newline at end of file diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index f99ae608c..c0aaa7f93 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -17,10 +17,9 @@ class TestReadFile(BasicChallenge): return os.path.join(os.path.dirname(__file__), "r_file_data.json") @pytest.mark.depends(on=["basic_write_file"], name="basic_read_file") - def test_method( - self, workspace - ): # run_test is a common name that all tests must implement - files_contents = self.open_files(workspace, self.data.ground.files) + def test_method(self, config): + self.setup_challenge(config) + files_contents = self.open_files(config["workspace"], self.data.ground.files) scores = [] for file_content in files_contents: diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 39c73b163..306375ddd 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -10,9 +10,9 @@ class TestWriteFile(BasicChallenge): return os.path.join(os.path.dirname(__file__), "w_file_data.json") @pytest.mark.depends(on=[], name="basic_write_file") - def test_method(self, workspace): - print("my workspace is ", workspace) - files_contents = self.open_files(workspace, self.data.ground.files) + def test_method(self, config): + self.setup_challenge(config) + files_contents = self.open_files(config["workspace"], self.data.ground.files) scores = [] for file_content in files_contents: diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json index 8a6278fea..d13b763c7 100644 --- a/agbenchmark/tests/regression/regression_tests.json +++ b/agbenchmark/tests/regression/regression_tests.json @@ -2,13 +2,6 @@ "TestWriteFile": { "difficulty": "basic", "dependencies": [], - "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0-run_agent0]" - }, - "TestReadFile": { - "difficulty": "basic", - "dependencies": [ - "basic_write_file" - ], - "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py::TestReadFile::test_method[challenge_data0-run_agent0]" + "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0]" } } \ No newline at end of file diff --git a/agent/hook.py b/agent/hook.py new file mode 100644 index 000000000..6fa534180 --- /dev/null +++ b/agent/hook.py @@ -0,0 +1,10 @@ +async def run_specific_agent(task, conn): + while ( + not conn.poll() + ): # Check if there's a termination signal from the main process + response, cycle_count = await run_agent( + task + ) # run the agent and get the response and cycle count + + # Send response and cycle count back to the main process + conn.send((response, cycle_count)) diff --git a/pyproject.toml b/pyproject.toml index af9688d14..043fe68a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,8 +16,6 @@ 
openai = "^0.27.8"
 pydantic = "^1.10.9"
 pytest-depends = "^1.0.1"
 python-dotenv = "^1.0.0"
-pexpect = "^4.8.0"
-wexpect = "^4.0.0"

 [build-system]
-- cgit v1.2.3

From 7c352b745ec90486826289ed735800197e95cd80 Mon Sep 17 00:00:00 2001
From: Silen Naihin
Date: Fri, 30 Jun 2023 11:55:43 -0400
Subject: integrate config, agent_interface just func, hook

---
 agbenchmark/Challenge.py                           |   5 +-
 agbenchmark/agent_interface.py                     | 118 +++++++--------------
 agbenchmark/config.json                            |   9 +-
 agbenchmark/start_benchmark.py                     |  12 ++-
 agbenchmark/tests/regression/regression_tests.json |   8 +-
 agent/benchmarks.py                                |  15 +++
 agent/hook.py                                      |  10 --
 7 files changed, 70 insertions(+), 107 deletions(-)
 create mode 100644 agent/benchmarks.py
 delete mode 100644 agent/hook.py

diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py
index 7b1e4df04..d7a2bdc9b 100644
--- a/agbenchmark/Challenge.py
+++ b/agbenchmark/Challenge.py
@@ -23,6 +23,7 @@ class Challenge(ABC):
 
     @property
     def data(self) -> ChallengeData:
+        # TODO: make it so that this is cached somewhere to just call self.deserialized_data
         return ChallengeData.deserialize(self.get_file_path())
 
     @property
@@ -37,19 +38,15 @@ class Challenge(ABC):
 
     @property
     def dependencies(self) -> list:
-        print("self.data.dependencies", self.data.dependencies)
         return self.data.dependencies
 
     def setup_challenge(self, config):
         from agbenchmark.agent_interface import run_agent
 
-        print("SETTING UP CHALLENGE...")
-
         run_agent(self.task, self.mock, config)
 
     @property
     def name(self) -> str:
-        print("self.data.name", self.data.name)
         return self.data.name
 
     @pytest.mark.parametrize(
diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py
index eba26fc18..2ff2acf30 100644
--- a/agbenchmark/agent_interface.py
+++ b/agbenchmark/agent_interface.py
@@ -1,12 +1,9 @@
 import os
-import sys
-import subprocess
+import importlib
 import time
 from agbenchmark.mocks.MockManager import MockManager
 from multiprocessing import Process, Pipe
 
-from agent.hook import run_specific_agent
-
 from dotenv import load_dotenv
 
 load_dotenv()
@@ -26,83 +23,48 @@ def run_agent(task, mock_func, config):
         print("Server unavailable, using mock", mock_func)
         mock_manager.delegate(mock_func)
     else:
-        if config["agent"]["type"] == "python":
-            run_agent_function(config, task)
-        elif config["agent"]["type"] == "script":
-            run_agent_command(config, task)
-
-
-ENVIRONMENT = os.getenv("ENVIRONMENT") or "production"
-
-
-def run_agent_command(config, task):
-    path = config["agent"]["path"]
-
-    if ENVIRONMENT == "local":
-        AGENT_NAME = os.getenv("AGENT_NAME")
-        path = os.path.join(os.getcwd(), f"agent\\{AGENT_NAME}")
-
-    timeout = config["agent"]["cutoff"] or sys.maxsize
-    print(f"Running {task} with timeout {timeout}")
-
-    command_from_config = config["agent"]["script"]
-    command_list = command_from_config.split()
-
-    # replace '{}' with the task
-    command_list = [cmd if cmd != "{}" else task for cmd in command_list]
-    print("path, command_list", path, command_list)
-    start_time = time.time()
-    proc = subprocess.Popen(
-        command_list,
-        cwd=path,
-        shell=True,
-    )
-
-    while True:
-        if time.time() - start_time > timeout:
-            print("The subprocess has exceeded the time limit and was terminated.")
-            proc.terminate()
-            break
-
-        if proc.poll() is not None:
-            print("The subprocess has finished running.")
-            break
-
-
-def run_agent_function(config, task):
-    timeout = (
-        config["cutoff"]["count"] if config["cutoff"]["type"] == "time" else sys.maxsize
-    )
-    print(
-        f"Running Python function '{config['agent']['function']}' with timeout {timeout}"
-    )
-
-    parent_conn, child_conn = Pipe()
-    process = Process(target=run_specific_agent, args=(task, child_conn))
-    process.start()
-    start_time = time.time()
-
-    while True:
-        if parent_conn.poll():  # Check if there's a new message from the child process
-            response, cycle_count = parent_conn.recv()
-            print(f"Cycle {cycle_count}: {response}")
-
-            if cycle_count >= config["cutoff"]["count"]:
+        timeout = config["cutoff"]
+        print(f"Running Python function '{config['func_path']}' with timeout {timeout}")
+
+        parent_conn, child_conn = Pipe()
+
+        # Import the specific agent dynamically
+        module_name = config["func_path"].replace("/", ".").rstrip(".py")
+        module = importlib.import_module(module_name)
+        run_specific_agent = getattr(module, "run_specific_agent")
+
+        process = Process(target=run_specific_agent, args=(task, child_conn))
+        process.start()
+        start_time = time.time()
+
+        while True:
+            if (
+                parent_conn.poll()
+            ):  # Check if there's a new message from the child process
+                response, cycle_count = parent_conn.recv()
+                print(f"Cycle {cycle_count}: {response}")
+
+                if cycle_count >= config["cutoff"]:
+                    print(
+                        f"Cycle count has reached the limit of {config['cutoff']}. Terminating."
+                    )
+                    child_conn.send("terminate")
+                    break
+
+            if time.time() - start_time > timeout:
                 print(
-                    f"Cycle count has reached the limit of {config['cutoff']['count']}. Terminating."
+                    "The Python function has exceeded the time limit and was terminated."
                 )
-                child_conn.send("terminate")
+                child_conn.send(
+                    "terminate"
+                )  # Send a termination signal to the child process
                 break
 
-            if time.time() - start_time > timeout:
-                print("The Python function has exceeded the time limit and was terminated.")
-                child_conn.send(
-                    "terminate"
-                )  # Send a termination signal to the child process
-                break
+            if not process.is_alive():
+                print("The Python function has finished running.")
+                break
 
-            if not process.is_alive():
-                print("The Python function has finished running.")
-                break
+        process.join()
 
-        process.join()
+
+ENVIRONMENT = os.getenv("ENVIRONMENT") or "production"
diff --git a/agbenchmark/config.json b/agbenchmark/config.json
index 7388085dc..d9b42ca42 100644
--- a/agbenchmark/config.json
+++ b/agbenchmark/config.json
@@ -1,10 +1,5 @@
 {
     "workspace": "C:\\Users\\silen\\miniagi",
-    "agent": {
-        "type": "script",
-        "path": "",
-        "script": "python miniagi.py {}",
-        "user_input": "\n",
-        "cutoff": 60
-    }
+    "func_path": "agent/benchmarks.py",
+    "cutoff": 60
 }
diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py
index c9f3643cc..fe395cd21 100644
--- a/agbenchmark/start_benchmark.py
+++ b/agbenchmark/start_benchmark.py
@@ -29,7 +29,17 @@ def start(category, noreg, mock):
 
         config["workspace"] = click.prompt(
             "Please enter a new workspace path",
-            default=os.path.join(Path.home(), "miniagi"),
+            default=os.path.join(Path.home(), "workspace"),
+        )
+
+        config["func_path"] = click.prompt(
+            "Please enter the path to your run_specific_agent function implementation",
+            default="/benchmarks.py",
+        )
+
+        config["cutoff"] = click.prompt(
+            "Please enter a hard cutoff runtime for your agent",
+            default="60",
         )
 
         with open(config_dir, "w") as f:
diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json
index d13b763c7..9e26dfeeb 100644
--- a/agbenchmark/tests/regression/regression_tests.json
+++ b/agbenchmark/tests/regression/regression_tests.json
@@ -1,7 +1 @@
-{
-    "TestWriteFile": {
-        "difficulty": "basic",
-        "dependencies": [],
-        "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py::TestWriteFile::test_method[challenge_data0]"
-    }
-}
\ No newline at end of file
+{}
\ No newline at end of file
diff --git a/agent/benchmarks.py b/agent/benchmarks.py
new file mode 100644
index 000000000..eb66412c1
--- /dev/null
+++ b/agent/benchmarks.py
@@ -0,0 +1,15 @@
+# import subprocess
+
+
+def run_specific_agent(task, conn):
+    cycle_count = 0
+    while (
+        not conn.poll()
+    ):  # Check if there's a termination signal from the main process
+        response = run_agent(task)  # run the agent and get the response and cycle count
+
+        if response:
+            cycle_count += 1
+
+        # Send response and cycle count back to the main process
+        conn.send((response, cycle_count))
diff --git a/agent/hook.py b/agent/hook.py
deleted file mode 100644
index 6fa534180..000000000
--- a/agent/hook.py
+++ /dev/null
@@ -1,10 +0,0 @@
-async def run_specific_agent(task, conn):
-    while (
-        not conn.poll()
-    ):  # Check if there's a termination signal from the main process
-        response, cycle_count = await run_agent(
-            task
-        )  # run the agent and get the response and cycle count
-
-        # Send response and cycle count back to the main process
-        conn.send((response, cycle_count))
-- cgit v1.2.3

From 2062844fa6b0250017ba65712e1a590a5fc28616 Mon Sep 17 00:00:00 2001
From: merwanehamadi
Date: Sun, 2 Jul 2023 07:38:30 -0700
Subject: Integrate one challenge to auto gpt (#44)

---
 .github/workflows/autogpt.yml                      | 62 +++++++++++++++++++++
 .gitignore                                         |  4 +-
 .gitmodules                                        |  4 ++
 agbenchmark/agent_interface.py                     | 62 ++++++++++-----------
 agbenchmark/config.json                            |  4 +-
 agbenchmark/conftest.py                            | 21 ++++---
 agbenchmark/start_benchmark.py                     | 64 +++++++++++++---------
 .../basic_abilities/write_file/write_file_test.py  |  7 ++-
 agbenchmark/tests/regression/regression_tests.json |  1 -
 agent/Auto-GPT                                     |  1 +
 agent/mini-agi                                     |  1 -
 regression_tests.json                              |  7 +++
 12 files changed, 164 insertions(+), 74 deletions(-)
 create mode 100644 .github/workflows/autogpt.yml
 create mode 100644 .gitmodules
 delete mode 100644 agbenchmark/tests/regression/regression_tests.json
 create mode 160000 agent/Auto-GPT
 delete mode 160000 agent/mini-agi
 create mode 100644 regression_tests.json

diff --git a/.github/workflows/autogpt.yml b/.github/workflows/autogpt.yml
new file mode 100644
index 000000000..2b1925117
--- /dev/null
+++ b/.github/workflows/autogpt.yml
@@ -0,0 +1,62 @@
+name: Auto-GPT Regression Test
+
+on:
+  workflow_dispatch:
+
+jobs:
+  regression-tests:
+    permissions:
+      pull-requests: write
+      contents: write
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    strategy:
+      matrix:
+        python-version: ["3.10"]
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.pull_request.head.ref }}
+          repository: ${{ github.event.pull_request.head.repo.full_name }}
+          submodules: true
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - id: get_date
+        name: Get date
+        run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
+
+      - name: Install Poetry
+        run: |
+          curl -sSL https://install.python-poetry.org | python -
+
+      - name: Set up Poetry cache
+        uses: actions/cache@v2
+        with:
+          path: |
+            ~/.cache/pypoetry
+            .venv
+          key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }}
+
+      - name: Set up venv and install Python dependencies
+        run: |
+          python -m venv venv
+          source venv/bin/activate
+          poetry install
+ + - name: Build project + run: | + source venv/bin/activate + poetry build + cd agent/Auto-GPT + pip install -r requirements.txt + pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl + agbenchmark start --reg + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} diff --git a/.gitignore b/.gitignore index 68bc17f9f..c41065ca4 100644 --- a/.gitignore +++ b/.gitignore @@ -157,4 +157,6 @@ cython_debug/ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ +.idea/ +.DS_Store +``` diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..b2dc714c5 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "Auto-GPT"] + path = agent/Auto-GPT + url = https://github.com/Significant-Gravitas/Auto-GPT.git + branch = benchmark-integration diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index 2ff2acf30..0961dc0f0 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -1,9 +1,10 @@ -import os import importlib -import time -from agbenchmark.mocks.MockManager import MockManager -from multiprocessing import Process, Pipe +from agbenchmark.mocks.MockManager import MockManager +import os +import sys +import subprocess +import time from dotenv import load_dotenv load_dotenv() @@ -26,45 +27,44 @@ def run_agent(task, mock_func, config): timeout = config["cutoff"] print(f"Running Python function '{config['func_path']}' with timeout {timeout}") - parent_conn, child_conn = Pipe() + # Get the current working directory + cwd = os.getcwd() + + # Add current directory to Python's import path + sys.path.append(cwd) + - # Import the specific agent dynamically module_name = config["func_path"].replace("/", ".").rstrip(".py") module = importlib.import_module(module_name) - run_specific_agent = getattr(module, "run_specific_agent") - process = Process(target=run_specific_agent, args=(task, child_conn)) - process.start() + + command = [sys.executable, "benchmarks.py", str(task)] + process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True, cwd=cwd) + start_time = time.time() + timeout = config["cutoff"] while True: - if ( - parent_conn.poll() - ): # Check if there's a new message from the child process - response, cycle_count = parent_conn.recv() - print(f"Cycle {cycle_count}: {response}") - - if cycle_count >= config["cutoff"]: - print( - f"Cycle count has reached the limit of {config['cutoff']}. Terminating." - ) - child_conn.send("terminate") - break + output = process.stdout.readline() + print(output.strip()) - if time.time() - start_time > timeout: - print( - "The Python function has exceeded the time limit and was terminated." 
- ) - child_conn.send( - "terminate" - ) # Send a termination signal to the child process + # Check if process has ended + if process.poll() is not None: + print("The Python function has finished running.") break - if not process.is_alive(): - print("The Python function has finished running.") + # Check if process has exceeded timeout + if time.time() - start_time > timeout: + print("The Python function has exceeded the time limit and was terminated.") + process.terminate() break - process.join() + # Optional: sleep for a while + time.sleep(0.1) + + # Wait for process to terminate, then get return code + process.wait() + ENVIRONMENT = os.getenv("ENVIRONMENT") or "production" diff --git a/agbenchmark/config.json b/agbenchmark/config.json index d9b42ca42..e1c5f154b 100644 --- a/agbenchmark/config.json +++ b/agbenchmark/config.json @@ -1,5 +1,5 @@ { - "workspace": "C:\\Users\\silen\\miniagi", - "func_path": "agent/benchmarks.py", + "workspace": "autogpt/workspace/auto_gpt_workspace", + "func_path": "benchmarks.py", "cutoff": 60 } diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 0f1fc7bb2..4284d1ebf 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -1,15 +1,18 @@ import json import os +from pathlib import Path + import pytest import shutil from agbenchmark.tests.regression.RegressionManager import RegressionManager +from agbenchmark.start_benchmark import CONFIG_PATH, REGRESSION_TESTS_PATH @pytest.fixture(scope="module") def config(request): - config_file = os.path.abspath("agbenchmark/config.json") - print(f"Config file: {config_file}") - with open(config_file, "r") as f: + + print(f"Config file: {CONFIG_PATH}") + with open(CONFIG_PATH, "r") as f: config = json.load(f) if request.config.getoption("--mock"): @@ -36,10 +39,7 @@ def workspace(config): def pytest_addoption(parser): parser.addoption("--mock", action="store_true", default=False) - -regression_json = "agbenchmark/tests/regression/regression_tests.json" - -regression_manager = RegressionManager(regression_json) +regression_manager = RegressionManager(REGRESSION_TESTS_PATH) # this is to get the challenge_data from every test @@ -53,13 +53,16 @@ def pytest_runtest_makereport(item, call): challenge_data = item.funcargs.get("challenge_data", None) difficulty = challenge_data.info.difficulty if challenge_data else "unknown" dependencies = challenge_data.dependencies if challenge_data else [] - + parts = item.nodeid.split("::")[0].split("/") + agbenchmark_index = parts.index("agbenchmark") + file_path = "/".join(parts[agbenchmark_index:]) test_details = { "difficulty": difficulty, "dependencies": dependencies, - "test": item.nodeid, + "test": file_path, } + print("pytest_runtest_makereport", test_details) if call.excinfo is None: regression_manager.add_test(item.nodeid.split("::")[1], test_details) diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index fe395cd21..28b038e9a 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -7,6 +7,13 @@ from dotenv import load_dotenv, set_key load_dotenv() +CURRENT_DIRECTORY = Path(__file__).resolve().parent + +new_path = CURRENT_DIRECTORY / "config.json" + +CONFIG_PATH = str(new_path.resolve()) + +REGRESSION_TESTS_PATH = str(Path(os.getcwd()) / "regression_tests.json") @click.group() def cli(): @@ -15,16 +22,12 @@ def cli(): @cli.command() @click.option("--category", default=None, help="Specific category to run") -@click.option("--noreg", is_flag=True, help="Skip regression tests") +@click.option("--reg", 
is_flag=True, help="Runs only regression tests") @click.option("--mock", is_flag=True, help="Run with mock") -def start(category, noreg, mock): +def start(category, reg, mock): """Start the benchmark tests. If a category flag is provided, run the categories with that mark.""" - config_file = "agbenchmark/config.json" - - config_dir = os.path.abspath(config_file) - # Check if configuration file exists and is not empty - if not os.path.exists(config_dir) or os.stat(config_dir).st_size == 0: + if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0: config = {} config["workspace"] = click.prompt( @@ -42,11 +45,11 @@ def start(category, noreg, mock): default="60", ) - with open(config_dir, "w") as f: + with open(CONFIG_PATH, "w") as f: json.dump(config, f) else: # If the configuration file exists and is not empty, load it - with open(config_dir, "r") as f: + with open(CONFIG_PATH, "r") as f: config = json.load(f) set_key(".env", "MOCK_TEST", "True" if mock else "False") @@ -58,11 +61,9 @@ def start(category, noreg, mock): if not os.path.exists(workspace_path): os.makedirs(workspace_path, exist_ok=True) - regression_path = os.path.abspath( - "agbenchmark/tests/regression/regression_tests.json" - ) - if not os.path.exists(regression_path): - with open(regression_path, "a"): + + if not os.path.exists(REGRESSION_TESTS_PATH): + with open(REGRESSION_TESTS_PATH, "a"): pass print("Current configuration:") @@ -70,31 +71,40 @@ def start(category, noreg, mock): print(f"{key}: {value}") print("Starting benchmark tests...", category) - pytest_args = ["agbenchmark", "-vs"] + tests_to_run = [] + pytest_args = ["-vs"] if category: pytest_args.extend( ["-m", category] - ) # run categorys that are of a specific marker - if noreg: - pytest_args.extend( - ["-k", "not regression"] - ) # run categorys that are of a specific marker but don't include regression categorys - print(f"Running {'non-regression' + category if noreg else category} categorys") + ) else: - if noreg: - print("Running all non-regression categorys") - pytest_args.extend( - ["-k", "not regression"] - ) # run categorys that are not regression categorys + if reg: + print("Running all regression tests") + tests_to_run = get_regression_tests() else: - print("Running all categorys") # run all categorys + print("Running all categories") if mock: pytest_args.append("--mock") # Run pytest with the constructed arguments + if not tests_to_run: + tests_to_run = [str(CURRENT_DIRECTORY)] + pytest_args.extend(tests_to_run) pytest.main(pytest_args) +def get_regression_tests(): + if not Path(REGRESSION_TESTS_PATH).exists(): + with open(REGRESSION_TESTS_PATH, 'w') as file: + json.dump({}, file) + + with open(REGRESSION_TESTS_PATH, 'r') as file: + data = json.load(file) + + regression_tests = [str(CURRENT_DIRECTORY / ".." 
/ value['test']) for key, value in data.items()] + + return regression_tests + if __name__ == "__main__": start() diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 306375ddd..8d3eb5404 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -1,3 +1,5 @@ +from pathlib import Path + import pytest from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os @@ -9,10 +11,11 @@ class TestWriteFile(BasicChallenge): def get_file_path(self) -> str: # all tests must implement this method return os.path.join(os.path.dirname(__file__), "w_file_data.json") - @pytest.mark.depends(on=[], name="basic_write_file") def test_method(self, config): self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + + workspace = Path(os.getcwd()) / config['workspace'] + files_contents = self.open_files(workspace, self.data.ground.files) scores = [] for file_content in files_contents: diff --git a/agbenchmark/tests/regression/regression_tests.json b/agbenchmark/tests/regression/regression_tests.json deleted file mode 100644 index 9e26dfeeb..000000000 --- a/agbenchmark/tests/regression/regression_tests.json +++ /dev/null @@ -1 +0,0 @@ -{} \ No newline at end of file diff --git a/agent/Auto-GPT b/agent/Auto-GPT new file mode 160000 index 000000000..c29ec925f --- /dev/null +++ b/agent/Auto-GPT @@ -0,0 +1 @@ +Subproject commit c29ec925fd9e24f219ef0f2884b08908cd66239b diff --git a/agent/mini-agi b/agent/mini-agi deleted file mode 160000 index d2add8f18..000000000 --- a/agent/mini-agi +++ /dev/null @@ -1 +0,0 @@ -Subproject commit d2add8f18caf96934a2d193583720cfc9b89451b diff --git a/regression_tests.json b/regression_tests.json new file mode 100644 index 000000000..e3633a2af --- /dev/null +++ b/regression_tests.json @@ -0,0 +1,7 @@ +{ + "TestWriteFile": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py" + } +} \ No newline at end of file -- cgit v1.2.3 From 838f72097cc82b9e12dead330632b83056c7b3f6 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Sun, 2 Jul 2023 13:14:49 -0700 Subject: Add static linters ci (#45) --- .flake8 | 13 + .github/workflows/autogpt.yml | 10 +- .github/workflows/ci.yml | 68 ++++ .gitmodules | 2 +- .python-version | 1 + agbenchmark/Challenge.py | 124 ------ agbenchmark/agent_interface.py | 29 +- agbenchmark/challenge.py | 126 ++++++ agbenchmark/challenges/define_task_types.py | 6 +- agbenchmark/challenges/retrieval/Retrieval.py | 5 +- agbenchmark/challenges/retrieval/r1/r1_test.py | 6 +- agbenchmark/conftest.py | 26 +- agbenchmark/mocks/MockManager.py | 28 -- agbenchmark/mocks/mock_manager.py | 29 ++ agbenchmark/mocks/tests/basic_mocks.py | 6 +- agbenchmark/mocks/tests/retrieval_mocks.py | 5 +- agbenchmark/start_benchmark.py | 31 +- .../tests/basic_abilities/BasicChallenge.py | 9 - .../tests/basic_abilities/basic_challenge.py | 8 + .../basic_abilities/read_file/read_file_test.py | 13 +- .../basic_abilities/write_file/write_file_test.py | 10 +- mypy.ini | 5 + poetry.lock | 422 ++++++++++++++------- pyproject.toml | 28 +- 24 files changed, 652 insertions(+), 358 deletions(-) create mode 100644 .flake8 create mode 100644 .github/workflows/ci.yml create mode 100644 .python-version delete mode 100644 agbenchmark/Challenge.py create mode 100644 
agbenchmark/challenge.py delete mode 100644 agbenchmark/mocks/MockManager.py create mode 100644 agbenchmark/mocks/mock_manager.py delete mode 100644 agbenchmark/tests/basic_abilities/BasicChallenge.py create mode 100644 agbenchmark/tests/basic_abilities/basic_challenge.py create mode 100644 mypy.ini diff --git a/.flake8 b/.flake8 new file mode 100644 index 000000000..cb9c777b5 --- /dev/null +++ b/.flake8 @@ -0,0 +1,13 @@ +[flake8] +max-line-length = 88 +select = "E303, W293, W291, W292, E305, E231, E302" +exclude = + .tox, + __pycache__, + *.pyc, + .env + venv*/*, + .venv/*, + reports/*, + dist/*, + agent/* diff --git a/.github/workflows/autogpt.yml b/.github/workflows/autogpt.yml index 2b1925117..e889b4c35 100644 --- a/.github/workflows/autogpt.yml +++ b/.github/workflows/autogpt.yml @@ -46,14 +46,14 @@ jobs: - name: Set up venv and install Python dependencies run: | - python -m venv venv - source venv/bin/activate - poetry install + poetry install --only main + poetry build + - - name: Build project + - name: Run regression tests run: | + python -m venv venv source venv/bin/activate - poetry build cd agent/Auto-GPT pip install -r requirements.txt pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 000000000..2d25e4ffe --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,68 @@ +name: Python CI + +on: + push: + branches: [ master, ci-test* ] + pull_request: + branches: [ stable, master, release-* ] + +jobs: + lint: + + runs-on: ubuntu-latest + env: + min-python-version: "3.10" + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 0 + ref: ${{ github.event.pull_request.head.ref }} + repository: ${{ github.event.pull_request.head.repo.full_name }} + + - name: Set up Python ${{ env.min-python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ env.min-python-version }} + + - id: get_date + name: Get date + run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT + + - name: Install Poetry + run: | + curl -sSL https://install.python-poetry.org | python - + + - name: Set up Poetry cache + uses: actions/cache@v2 + with: + path: | + ~/.cache/pypoetry + .venv + key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} + + - name: Install dependencies + run: | + poetry install + + - name: Lint with flake8 + run: poetry run flake8 + + - name: Check black formatting + run: poetry run black . --check + if: success() || failure() + + - name: Check isort formatting + run: poetry run isort . --check + if: success() || failure() + + - name: Check mypy formatting + run: poetry run mypy --ignore-missing-imports . 
+ if: success() || failure() + + - name: Check for unused imports and pass statements + run: | + cmd="poetry run autoflake --remove-all-unused-imports --recursive --ignore-init-module-imports --ignore-pass-after-docstring agbenchmark" + $cmd --check || (echo "You have unused imports or pass statements, please run '${cmd} --in-place'" && exit 1) + if: success() || failure() diff --git a/.gitmodules b/.gitmodules index b2dc714c5..2e3a86e5f 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,4 +1,4 @@ -[submodule "Auto-GPT"] +[submodule "agent/Auto-GPT"] path = agent/Auto-GPT url = https://github.com/Significant-Gravitas/Auto-GPT.git branch = benchmark-integration diff --git a/.python-version b/.python-version new file mode 100644 index 000000000..d5cd4cce2 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.10.10 diff --git a/agbenchmark/Challenge.py b/agbenchmark/Challenge.py deleted file mode 100644 index d7a2bdc9b..000000000 --- a/agbenchmark/Challenge.py +++ /dev/null @@ -1,124 +0,0 @@ -import os -import glob -import pytest -from abc import ABC, abstractmethod -from agbenchmark.challenges.define_task_types import Ground -from agbenchmark.challenges.define_task_types import ChallengeData -from dotenv import load_dotenv - -load_dotenv() - -mock_test_str = os.getenv("MOCK_TEST") -MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str else False - - -class Challenge(ABC): - """The parent class to all specific challenges classes. - Defines helper methods for running a challenge""" - - @abstractmethod - def get_file_path(self) -> str: - """This should be implemented by any class which inherits from BasicChallenge""" - pass - - @property - def data(self) -> ChallengeData: - # TODO: make it so that this is cached somewhere to just call self.deserialized_data - return ChallengeData.deserialize(self.get_file_path()) - - @property - def mock(self): - return self.data.mock.mock_func if self.data.mock else None - - @property - def task(self): - return ( - self.data.mock.mock_task if self.data.mock and MOCK_TEST else self.data.task - ) - - @property - def dependencies(self) -> list: - return self.data.dependencies - - def setup_challenge(self, config): - from agbenchmark.agent_interface import run_agent - - run_agent(self.task, self.mock, config) - - @property - def name(self) -> str: - return self.data.name - - @pytest.mark.parametrize( - "challenge_data", - [data], - indirect=True, - ) - def test_method(self, config): - raise NotImplementedError - - @staticmethod - def open_file(workspace: str, filename: str): - script_dir = os.path.abspath(workspace) - workspace_dir = os.path.join(script_dir, filename) - with open(workspace_dir, "r") as f: - return f.read() - - @staticmethod - def open_files(workspace: str, file_patterns: list): - script_dir = os.path.abspath(workspace) - files_contents = [] - - for file_pattern in file_patterns: - # Check if it is a file extension - if file_pattern.startswith("."): - # Find all files with the given extension in the workspace - matching_files = glob.glob(os.path.join(script_dir, "*" + file_pattern)) - else: - # Otherwise, it is a specific file - matching_files = [os.path.join(script_dir, file_pattern)] - - for file_path in matching_files: - with open(file_path, "r") as f: - files_contents.append(f.read()) - - return files_contents - - @staticmethod - def write_to_file(workspace: str, filename: str, content: str): - script_dir = os.path.abspath(workspace) - print("Writing file at", script_dir) - workspace_dir = os.path.join(script_dir, filename) - - # Open the 
file in write mode. - with open(workspace_dir, "w") as f: - # Write the content to the file. - f.write(content) - - def get_filenames_in_workspace(self, workspace: str): - return [ - filename - for filename in os.listdir(workspace) - if os.path.isfile(os.path.join(workspace, filename)) - ] - - def scoring(self, content: str, ground: Ground): - if ground.should_contain: - for should_contain_word in ground.should_contain: - if should_contain_word not in content: - return 0.0 - else: - print( - f"Word that should exist: {should_contain_word} exists in the content" - ) - - if ground.should_not_contain: - for should_not_contain_word in ground.should_not_contain: - if should_not_contain_word in content: - return 0.0 - else: - print( - f"Word that should not exist: {should_not_contain_word} does not exist in the content" - ) - - return 1.0 diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index 0961dc0f0..bd75f8dbb 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -1,18 +1,22 @@ import importlib - -from agbenchmark.mocks.MockManager import MockManager import os -import sys import subprocess +import sys import time +from typing import Any, Dict, Optional + from dotenv import load_dotenv +from agbenchmark.mocks.mock_manager import MockManager + load_dotenv() MOCK_FLAG = os.getenv("MOCK_TEST") -def run_agent(task, mock_func, config): +def run_agent( + task: Optional[str], mock_func: Optional[str], config: Dict[str, Any] +) -> None: """Calling to get a response""" if mock_func == None and MOCK_FLAG == "True": @@ -33,18 +37,24 @@ def run_agent(task, mock_func, config): # Add current directory to Python's import path sys.path.append(cwd) - module_name = config["func_path"].replace("/", ".").rstrip(".py") module = importlib.import_module(module_name) - command = [sys.executable, "benchmarks.py", str(task)] - process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True, cwd=cwd) + process = subprocess.Popen( + command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True, + cwd=cwd, + ) start_time = time.time() timeout = config["cutoff"] while True: + if process.stdout is None: + continue output = process.stdout.readline() print(output.strip()) @@ -55,7 +65,9 @@ def run_agent(task, mock_func, config): # Check if process has exceeded timeout if time.time() - start_time > timeout: - print("The Python function has exceeded the time limit and was terminated.") + print( + "The Python function has exceeded the time limit and was terminated." + ) process.terminate() break @@ -66,5 +78,4 @@ def run_agent(task, mock_func, config): process.wait() - ENVIRONMENT = os.getenv("ENVIRONMENT") or "production" diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py new file mode 100644 index 000000000..eaed73a22 --- /dev/null +++ b/agbenchmark/challenge.py @@ -0,0 +1,126 @@ +import glob +import os +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Optional + +import pytest +from dotenv import load_dotenv + +from agbenchmark.challenges.define_task_types import ChallengeData, Ground + +load_dotenv() + +mock_test_str = os.getenv("MOCK_TEST") +MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str else False + + +class Challenge(ABC): + """The parent class to all specific challenges classes. 
+ Defines helper methods for running a challenge""" + + @abstractmethod + def get_file_path(self) -> str: + """This should be implemented by any class which inherits from BasicChallenge""" + pass + + @property + def data(self) -> ChallengeData: + # TODO: make it so that this is cached somewhere to just call self.deserialized_data + return ChallengeData.deserialize(self.get_file_path()) + + @property + def mock(self) -> Optional[str]: + return self.data.mock.mock_func if self.data.mock else None + + @property + def task(self) -> Optional[str]: + return ( + self.data.mock.mock_task if self.data.mock and MOCK_TEST else self.data.task + ) + + @property + def dependencies(self) -> list: + return self.data.dependencies + + def setup_challenge(self, config: Dict[str, Any]) -> None: + from agbenchmark.agent_interface import run_agent + + run_agent(self.task, self.mock, config) + + @property + def name(self) -> str: + return self.data.name + + @pytest.mark.parametrize( + "challenge_data", + [data], + indirect=True, + ) + def test_method(self, config: Dict[str, Any]) -> None: + raise NotImplementedError + + @staticmethod + def open_file(workspace: str, filename: str) -> str: + script_dir = os.path.abspath(workspace) + workspace_dir = os.path.join(script_dir, filename) + with open(workspace_dir, "r") as f: + return f.read() + + @staticmethod + def open_files(workspace: str, file_patterns: list) -> List[str]: + script_dir = os.path.abspath(workspace) + files_contents = [] + + for file_pattern in file_patterns: + # Check if it is a file extension + if file_pattern.startswith("."): + # Find all files with the given extension in the workspace + matching_files = glob.glob(os.path.join(script_dir, "*" + file_pattern)) + else: + # Otherwise, it is a specific file + matching_files = [os.path.join(script_dir, file_pattern)] + + for file_path in matching_files: + with open(file_path, "r") as f: + files_contents.append(f.read()) + + return files_contents + + @staticmethod + def write_to_file(workspace: str, filename: str, content: str) -> None: + script_dir = os.path.abspath(workspace) + print("Writing file at", script_dir) + workspace_dir = os.path.join(script_dir, filename) + + # Open the file in write mode. + with open(workspace_dir, "w") as f: + # Write the content to the file. 
+ f.write(content) + + def get_filenames_in_workspace(self, workspace: str) -> List[str]: + return [ + filename + for filename in os.listdir(workspace) + if os.path.isfile(os.path.join(workspace, filename)) + ] + + def scoring(self, content: str, ground: Ground) -> float: + if ground.should_contain: + for should_contain_word in ground.should_contain: + if should_contain_word not in content: + return 0.0 + else: + print( + f"Word that should exist: {should_contain_word} exists in the content" + ) + + if ground.should_not_contain: + for should_not_contain_word in ground.should_not_contain: + if should_not_contain_word in content: + return 0.0 + else: + print( + f"Word that should not exist: {should_not_contain_word} does not exist in the content" + ) + + return 1.0 diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index 7fc2361b5..52df3017b 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -1,7 +1,7 @@ -from pydantic import BaseModel -from typing import List, Optional import json -import os +from typing import List, Optional + +from pydantic import BaseModel class Mock(BaseModel): diff --git a/agbenchmark/challenges/retrieval/Retrieval.py b/agbenchmark/challenges/retrieval/Retrieval.py index b8aa81ce3..891cccef7 100644 --- a/agbenchmark/challenges/retrieval/Retrieval.py +++ b/agbenchmark/challenges/retrieval/Retrieval.py @@ -1,9 +1,8 @@ -from agbenchmark.Challenge import Challenge import pytest +from agbenchmark.challenge import Challenge + @pytest.mark.retrieval class RetrievalChallenge(Challenge): """Challenge for information-retrieval""" - - pass diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index b679a731d..675ac8bd7 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -1,5 +1,7 @@ -from agbenchmark.challenges.retrieval.Retrieval import RetrievalChallenge import os +from typing import Any, Dict + +from agbenchmark.challenges.retrieval.retrieval import RetrievalChallenge class TestRetrieval1(RetrievalChallenge): @@ -8,7 +10,7 @@ class TestRetrieval1(RetrievalChallenge): def get_file_path(self) -> str: # all tests must implement this method return os.path.join(os.path.dirname(__file__), "r1_data.json") - def test_method(self, config): + def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) files_contents = self.open_files(config["workspace"], self.data.ground.files) diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 4284d1ebf..613565fd2 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -1,16 +1,16 @@ import json import os -from pathlib import Path +import shutil +from typing import Any, Dict, Generator, List import pytest -import shutil -from agbenchmark.tests.regression.RegressionManager import RegressionManager + from agbenchmark.start_benchmark import CONFIG_PATH, REGRESSION_TESTS_PATH +from agbenchmark.tests.regression.RegressionManager import RegressionManager @pytest.fixture(scope="module") -def config(request): - +def config(request: Any) -> None: print(f"Config file: {CONFIG_PATH}") with open(CONFIG_PATH, "r") as f: config = json.load(f) @@ -22,7 +22,7 @@ def config(request): @pytest.fixture(scope="module") -def workspace(config): +def workspace(config: Dict[str, Any]) -> Generator[str, None, None]: yield config["workspace"] # teardown after test function completes for filename in 
os.listdir(config["workspace"]): @@ -36,19 +36,20 @@ def workspace(config): print(f"Failed to delete {file_path}. Reason: {e}") -def pytest_addoption(parser): +def pytest_addoption(parser: Any) -> None: parser.addoption("--mock", action="store_true", default=False) + regression_manager = RegressionManager(REGRESSION_TESTS_PATH) # this is to get the challenge_data from every test @pytest.fixture(autouse=True) -def challenge_data(request): +def challenge_data(request: Any) -> None: return request.param -def pytest_runtest_makereport(item, call): +def pytest_runtest_makereport(item: Any, call: Any) -> None: if call.when == "call": challenge_data = item.funcargs.get("challenge_data", None) difficulty = challenge_data.info.difficulty if challenge_data else "unknown" @@ -62,7 +63,6 @@ def pytest_runtest_makereport(item, call): "test": file_path, } - print("pytest_runtest_makereport", test_details) if call.excinfo is None: regression_manager.add_test(item.nodeid.split("::")[1], test_details) @@ -70,7 +70,7 @@ def pytest_runtest_makereport(item, call): regression_manager.remove_test(item.nodeid.split("::")[1]) -def pytest_collection_modifyitems(items): +def pytest_collection_modifyitems(items: List[Any]) -> None: """Called once all test items are collected. Used to add regression and depends markers to collected test items.""" for item in items: @@ -80,13 +80,13 @@ def pytest_collection_modifyitems(items): item.add_marker(pytest.mark.regression) -def pytest_sessionfinish(): +def pytest_sessionfinish() -> None: """Called at the end of the session to save regression tests""" regression_manager.save() # this is so that all tests can inherit from the Challenge class -def pytest_generate_tests(metafunc): +def pytest_generate_tests(metafunc: Any) -> None: if "challenge_data" in metafunc.fixturenames: # Get the instance of the test class test_class = metafunc.cls() diff --git a/agbenchmark/mocks/MockManager.py b/agbenchmark/mocks/MockManager.py deleted file mode 100644 index f4e7f5f5a..000000000 --- a/agbenchmark/mocks/MockManager.py +++ /dev/null @@ -1,28 +0,0 @@ -import sys -import agbenchmark.mocks.tests.basic_mocks as basic_mocks -import agbenchmark.mocks.tests.retrieval_mocks as retrieval_mocks - - -class MockManager: - def __init__(self, task: str): - self.task = task - self.workspace = "agbenchmark/mocks/workspace" - self.modules = [basic_mocks, retrieval_mocks] - - def delegate(self, mock_function_name, *args, **kwargs): - if hasattr(self, mock_function_name): - # Check if the mock function is an attribute of this class - getattr(self, mock_function_name)(*args, **kwargs) - elif mock_function_name in globals(): - # Check if the function is imported in the file - func = globals()[mock_function_name] - func(self.task, self.workspace, *args, **kwargs) - elif len(self.modules) > 0: - # checks if function is in imported modules - for module in self.modules: - if hasattr(module, mock_function_name): - func = getattr(module, mock_function_name) - func(self.task, self.workspace, *args, **kwargs) - return - else: - raise ValueError(f"No such mock: {mock_function_name}") diff --git a/agbenchmark/mocks/mock_manager.py b/agbenchmark/mocks/mock_manager.py new file mode 100644 index 000000000..59fa8dbf1 --- /dev/null +++ b/agbenchmark/mocks/mock_manager.py @@ -0,0 +1,29 @@ +from typing import Any + +import agbenchmark.mocks.tests.basic_mocks as basic_mocks +import agbenchmark.mocks.tests.retrieval_mocks as retrieval_mocks + + +class MockManager: + def __init__(self, task: str): + self.task = task + 
self.workspace = "agbenchmark/mocks/workspace" + self.modules = [basic_mocks, retrieval_mocks] + + def delegate(self, mock_function_name: Any, *args: Any, **kwargs: Any) -> None: + if hasattr(self, mock_function_name): + # Check if the mock function is an attribute of this class + getattr(self, mock_function_name)(*args, **kwargs) + elif mock_function_name in globals(): + # Check if the function is imported in the file + func = globals()[mock_function_name] + func(self.task, self.workspace, *args, **kwargs) + elif len(self.modules) > 0: + # checks if function is in imported modules + for module in self.modules: + if hasattr(module, mock_function_name): + func = getattr(module, mock_function_name) + func(self.task, self.workspace, *args, **kwargs) + return + else: + raise ValueError(f"No such mock: {mock_function_name}") diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index 631b30c2c..c79a8e2dd 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -1,7 +1,7 @@ -from agbenchmark.Challenge import Challenge +from agbenchmark.challenge import Challenge -def basic_read_file_mock(task: str, workspace: str): +def basic_read_file_mock(task: str, workspace: str) -> None: """ This mock reads a file and returns its content. """ @@ -13,7 +13,7 @@ def basic_read_file_mock(task: str, workspace: str): ) -def basic_write_file_mock(task: str, workspace: str): +def basic_write_file_mock(task: str, workspace: str) -> None: """ This mock writes to a file (creates one if it doesn't exist) """ diff --git a/agbenchmark/mocks/tests/retrieval_mocks.py b/agbenchmark/mocks/tests/retrieval_mocks.py index 2481de060..9a8a57db4 100644 --- a/agbenchmark/mocks/tests/retrieval_mocks.py +++ b/agbenchmark/mocks/tests/retrieval_mocks.py @@ -1,8 +1,5 @@ -from agbenchmark.Challenge import Challenge - - # TODO: Make it so that you can specify for tests to only run if their prerequisites are met. # Prerequisites here would be writing to a file (basic_abilities test). # Should also check if prerequisites exists in regression file -def retrieval_1_mock(task: str, workspace: str): +def retrieval_1_mock(task: str, workspace: str) -> None: pass diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index 28b038e9a..13e1af231 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -1,8 +1,11 @@ -import click -import pytest import json import os +import sys from pathlib import Path +from typing import List + +import click +import pytest from dotenv import load_dotenv, set_key load_dotenv() @@ -15,8 +18,9 @@ CONFIG_PATH = str(new_path.resolve()) REGRESSION_TESTS_PATH = str(Path(os.getcwd()) / "regression_tests.json") + @click.group() -def cli(): +def cli() -> None: pass @@ -24,7 +28,7 @@ def cli(): @click.option("--category", default=None, help="Specific category to run") @click.option("--reg", is_flag=True, help="Runs only regression tests") @click.option("--mock", is_flag=True, help="Run with mock") -def start(category, reg, mock): +def start(category: str, reg: bool, mock: bool) -> int: """Start the benchmark tests. 
If a category flag is provided, run the categories with that mark.""" # Check if configuration file exists and is not empty if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0: @@ -61,7 +65,6 @@ def start(category, reg, mock): if not os.path.exists(workspace_path): os.makedirs(workspace_path, exist_ok=True) - if not os.path.exists(REGRESSION_TESTS_PATH): with open(REGRESSION_TESTS_PATH, "a"): pass @@ -74,9 +77,7 @@ def start(category, reg, mock): tests_to_run = [] pytest_args = ["-vs"] if category: - pytest_args.extend( - ["-m", category] - ) + pytest_args.extend(["-m", category]) else: if reg: print("Running all regression tests") @@ -91,20 +92,24 @@ def start(category, reg, mock): if not tests_to_run: tests_to_run = [str(CURRENT_DIRECTORY)] pytest_args.extend(tests_to_run) - pytest.main(pytest_args) + return sys.exit(pytest.main(pytest_args)) -def get_regression_tests(): + +def get_regression_tests() -> List[str]: if not Path(REGRESSION_TESTS_PATH).exists(): - with open(REGRESSION_TESTS_PATH, 'w') as file: + with open(REGRESSION_TESTS_PATH, "w") as file: json.dump({}, file) - with open(REGRESSION_TESTS_PATH, 'r') as file: + with open(REGRESSION_TESTS_PATH, "r") as file: data = json.load(file) - regression_tests = [str(CURRENT_DIRECTORY / ".." / value['test']) for key, value in data.items()] + regression_tests = [ + str(CURRENT_DIRECTORY / ".." / value["test"]) for key, value in data.items() + ] return regression_tests + if __name__ == "__main__": start() diff --git a/agbenchmark/tests/basic_abilities/BasicChallenge.py b/agbenchmark/tests/basic_abilities/BasicChallenge.py deleted file mode 100644 index 6e7f73100..000000000 --- a/agbenchmark/tests/basic_abilities/BasicChallenge.py +++ /dev/null @@ -1,9 +0,0 @@ -import pytest -from agbenchmark.Challenge import Challenge -from agbenchmark.challenges.define_task_types import ChallengeData -from abc import abstractmethod - - -@pytest.mark.basic -class BasicChallenge(Challenge): - pass diff --git a/agbenchmark/tests/basic_abilities/basic_challenge.py b/agbenchmark/tests/basic_abilities/basic_challenge.py new file mode 100644 index 000000000..8b3a4db1d --- /dev/null +++ b/agbenchmark/tests/basic_abilities/basic_challenge.py @@ -0,0 +1,8 @@ +import pytest + +from agbenchmark.challenge import Challenge + + +@pytest.mark.basic +class BasicChallenge(Challenge): + pass diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index c0aaa7f93..c5f886d52 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -1,14 +1,17 @@ -import pytest -from agbenchmark.Challenge import Challenge -from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge import os +from typing import Any, Dict + +import pytest + +from agbenchmark.challenge import Challenge +from agbenchmark.tests.basic_abilities.basic_challenge import BasicChallenge class TestReadFile(BasicChallenge): """Testing if LLM can read a file""" @pytest.fixture(scope="module", autouse=True) - def setup_module(self, workspace): + def setup_module(self, workspace: str) -> None: Challenge.write_to_file( workspace, self.data.ground.files[0], "this is how we're doing" ) @@ -17,7 +20,7 @@ class TestReadFile(BasicChallenge): return os.path.join(os.path.dirname(__file__), "r_file_data.json") @pytest.mark.depends(on=["basic_write_file"], name="basic_read_file") - def test_method(self, config): + def 
test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) files_contents = self.open_files(config["workspace"], self.data.ground.files) diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 8d3eb5404..05db09657 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -1,8 +1,8 @@ +import os from pathlib import Path +from typing import Any, Dict -import pytest -from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge -import os +from agbenchmark.tests.basic_abilities.basic_challenge import BasicChallenge class TestWriteFile(BasicChallenge): @@ -11,10 +11,10 @@ class TestWriteFile(BasicChallenge): def get_file_path(self) -> str: # all tests must implement this method return os.path.join(os.path.dirname(__file__), "w_file_data.json") - def test_method(self, config): + def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - workspace = Path(os.getcwd()) / config['workspace'] + workspace = Path(os.getcwd()) / config["workspace"] files_contents = self.open_files(workspace, self.data.ground.files) scores = [] diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 000000000..315ecae56 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,5 @@ +[mypy] +follow_imports = skip +check_untyped_defs = True +disallow_untyped_defs = True +exclude = ^(agent/.*\.py)$ diff --git a/poetry.lock b/poetry.lock index a460f988d..e05fc6c04 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,9 +1,10 @@ -# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. +# This file is automatically @generated by Poetry and should not be changed by hand. 
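Stepping back from the lock file for a moment: the refactored write_file and read_file tests above settle on a repeatable shape for ability tests, so implement `get_file_path`, call `setup_challenge`, resolve the workspace relative to the current directory, then score the ground files. A hedged sketch of a new test following that shape is below; the class name, the `a_file_data.json` challenge file, and the final assertion are illustrative rather than taken from the repository.

```python
# Illustrative sketch only: a hypothetical challenge test written against
# the pattern used by TestWriteFile and TestReadFile above.
import os
from pathlib import Path
from typing import Any, Dict

from agbenchmark.tests.basic_abilities.basic_challenge import BasicChallenge


class TestAppendFile(BasicChallenge):
    """Hypothetical challenge: did the agent append the expected text?"""

    def get_file_path(self) -> str:  # all tests must implement this method
        # a_file_data.json is an assumed challenge-definition file.
        return os.path.join(os.path.dirname(__file__), "a_file_data.json")

    def test_method(self, config: Dict[str, Any]) -> None:
        # Run the agent (or its mock) against the task from the JSON data.
        self.setup_challenge(config)

        # Resolve the workspace the same way write_file_test.py does.
        workspace = Path(os.getcwd()) / config["workspace"]
        files_contents = self.open_files(str(workspace), self.data.ground.files)

        # Score each matching file against the should_contain /
        # should_not_contain ground rules; 1.0 means a pass.
        scores = [
            self.scoring(content, self.data.ground) for content in files_contents
        ]
        assert 1.0 in scores
```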
[[package]] name = "aiohttp" version = "3.8.4" description = "Async http client/server framework (asyncio)" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -112,6 +113,7 @@ speedups = ["Brotli", "aiodns", "cchardet"] name = "aiosignal" version = "1.3.1" description = "aiosignal: a list of registered asynchronous callbacks" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -126,6 +128,7 @@ frozenlist = ">=1.1.0" name = "async-timeout" version = "4.0.2" description = "Timeout context manager for asyncio programs" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -137,6 +140,7 @@ files = [ name = "attrs" version = "23.1.0" description = "Classes Without Boilerplate" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -151,10 +155,74 @@ docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib- tests = ["attrs[tests-no-zope]", "zope-interface"] tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +[[package]] +name = "autoflake" +version = "1.7.8" +description = "Removes unused imports and unused variables" +category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "autoflake-1.7.8-py3-none-any.whl", hash = "sha256:46373ef69b6714f5064c923bb28bd797c4f8a9497f557d87fc36665c6d956b39"}, + {file = "autoflake-1.7.8.tar.gz", hash = "sha256:e7e46372dee46fa1c97acf310d99d922b63d369718a270809d7c278d34a194cf"}, +] + +[package.dependencies] +pyflakes = ">=1.1.0,<3" +tomli = {version = ">=2.0.1", markers = "python_version < \"3.11\""} + +[[package]] +name = "black" +version = "22.3.0" +description = "The uncompromising code formatter." +category = "dev" +optional = false +python-versions = ">=3.6.2" +files = [ + {file = "black-22.3.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:2497f9c2386572e28921fa8bec7be3e51de6801f7459dffd6e62492531c47e09"}, + {file = "black-22.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5795a0375eb87bfe902e80e0c8cfaedf8af4d49694d69161e5bd3206c18618bb"}, + {file = "black-22.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e3556168e2e5c49629f7b0f377070240bd5511e45e25a4497bb0073d9dda776a"}, + {file = "black-22.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67c8301ec94e3bcc8906740fe071391bce40a862b7be0b86fb5382beefecd968"}, + {file = "black-22.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:fd57160949179ec517d32ac2ac898b5f20d68ed1a9c977346efbac9c2f1e779d"}, + {file = "black-22.3.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:cc1e1de68c8e5444e8f94c3670bb48a2beef0e91dddfd4fcc29595ebd90bb9ce"}, + {file = "black-22.3.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d2fc92002d44746d3e7db7cf9313cf4452f43e9ea77a2c939defce3b10b5c82"}, + {file = "black-22.3.0-cp36-cp36m-win_amd64.whl", hash = "sha256:a6342964b43a99dbc72f72812bf88cad8f0217ae9acb47c0d4f141a6416d2d7b"}, + {file = "black-22.3.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:328efc0cc70ccb23429d6be184a15ce613f676bdfc85e5fe8ea2a9354b4e9015"}, + {file = "black-22.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:06f9d8846f2340dfac80ceb20200ea5d1b3f181dd0556b47af4e8e0b24fa0a6b"}, + {file = "black-22.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:ad4efa5fad66b903b4a5f96d91461d90b9507a812b3c5de657d544215bb7877a"}, + {file = "black-22.3.0-cp38-cp38-macosx_10_9_universal2.whl", hash = 
"sha256:e8477ec6bbfe0312c128e74644ac8a02ca06bcdb8982d4ee06f209be28cdf163"}, + {file = "black-22.3.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:637a4014c63fbf42a692d22b55d8ad6968a946b4a6ebc385c5505d9625b6a464"}, + {file = "black-22.3.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:863714200ada56cbc366dc9ae5291ceb936573155f8bf8e9de92aef51f3ad0f0"}, + {file = "black-22.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10dbe6e6d2988049b4655b2b739f98785a884d4d6b85bc35133a8fb9a2233176"}, + {file = "black-22.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:cee3e11161dde1b2a33a904b850b0899e0424cc331b7295f2a9698e79f9a69a0"}, + {file = "black-22.3.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5891ef8abc06576985de8fa88e95ab70641de6c1fca97e2a15820a9b69e51b20"}, + {file = "black-22.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:30d78ba6bf080eeaf0b7b875d924b15cd46fec5fd044ddfbad38c8ea9171043a"}, + {file = "black-22.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ee8f1f7228cce7dffc2b464f07ce769f478968bfb3dd1254a4c2eeed84928aad"}, + {file = "black-22.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ee227b696ca60dd1c507be80a6bc849a5a6ab57ac7352aad1ffec9e8b805f21"}, + {file = "black-22.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:9b542ced1ec0ceeff5b37d69838106a6348e60db7b8fdd245294dc1d26136265"}, + {file = "black-22.3.0-py3-none-any.whl", hash = "sha256:bc58025940a896d7e5356952228b68f793cf5fcb342be703c3a2669a1488cb72"}, + {file = "black-22.3.0.tar.gz", hash = "sha256:35020b8886c022ced9282b51b5a875b6d1ab0c387b31a065b84db7c33085ca79"}, +] + +[package.dependencies] +click = ">=8.0.0" +mypy-extensions = ">=0.4.3" +pathspec = ">=0.9.0" +platformdirs = ">=2" +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} +typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""} + +[package.extras] +colorama = ["colorama (>=0.4.3)"] +d = ["aiohttp (>=3.7.4)"] +jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] +uvloop = ["uvloop (>=0.15.2)"] + [[package]] name = "certifi" version = "2023.5.7" description = "Python package for providing Mozilla's CA Bundle." +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -166,6 +234,7 @@ files = [ name = "charset-normalizer" version = "3.1.0" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." +category = "main" optional = false python-versions = ">=3.7.0" files = [ @@ -250,6 +319,7 @@ files = [ name = "click" version = "8.1.3" description = "Composable command line interface toolkit" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -264,6 +334,7 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} name = "colorama" version = "0.4.6" description = "Cross-platform colored terminal text." 
+category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" files = [ @@ -275,6 +346,7 @@ files = [ name = "exceptiongroup" version = "1.1.1" description = "Backport of PEP 654 (exception groups)" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -285,10 +357,28 @@ files = [ [package.extras] test = ["pytest (>=6)"] +[[package]] +name = "flake8" +version = "3.9.2" +description = "the modular source code checker: pep8 pyflakes and co" +category = "dev" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" +files = [ + {file = "flake8-3.9.2-py2.py3-none-any.whl", hash = "sha256:bf8fd333346d844f616e8d47905ef3a3384edae6b4e9beb0c5101e25e3110907"}, + {file = "flake8-3.9.2.tar.gz", hash = "sha256:07528381786f2a6237b061f6e96610a4167b226cb926e2aa2b6b1d78057c576b"}, +] + +[package.dependencies] +mccabe = ">=0.6.0,<0.7.0" +pycodestyle = ">=2.7.0,<2.8.0" +pyflakes = ">=2.3.0,<2.4.0" + [[package]] name = "frozenlist" version = "1.3.3" description = "A list-like structure which implements collections.abc.MutableSequence" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -372,6 +462,7 @@ files = [ name = "future-fstrings" version = "1.2.0" description = "A backport of fstrings to python<3.6" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -386,6 +477,7 @@ rewrite = ["tokenize-rt (>=3)"] name = "idna" version = "3.4" description = "Internationalized Domain Names in Applications (IDNA)" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -397,6 +489,7 @@ files = [ name = "iniconfig" version = "2.0.0" description = "brain-dead simple config-ini parsing" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -404,10 +497,41 @@ files = [ {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, ] +[[package]] +name = "isort" +version = "5.12.0" +description = "A Python utility / library to sort Python imports." 
+category = "dev" +optional = false +python-versions = ">=3.8.0" +files = [ + {file = "isort-5.12.0-py3-none-any.whl", hash = "sha256:f84c2818376e66cf843d497486ea8fed8700b340f308f076c6fb1229dff318b6"}, + {file = "isort-5.12.0.tar.gz", hash = "sha256:8bef7dde241278824a6d83f44a544709b065191b95b6e50894bdc722fcba0504"}, +] + +[package.extras] +colors = ["colorama (>=0.4.3)"] +pipfile-deprecated-finder = ["pip-shims (>=0.5.2)", "pipreqs", "requirementslib"] +plugins = ["setuptools"] +requirements-deprecated-finder = ["pip-api", "pipreqs"] + +[[package]] +name = "mccabe" +version = "0.6.1" +description = "McCabe checker, plugin for flake8" +category = "dev" +optional = false +python-versions = "*" +files = [ + {file = "mccabe-0.6.1-py2.py3-none-any.whl", hash = "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42"}, + {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"}, +] + [[package]] name = "multidict" version = "6.0.4" description = "multidict implementation" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -487,10 +611,64 @@ files = [ {file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"}, ] +[[package]] +name = "mypy" +version = "0.910" +description = "Optional static typing for Python" +category = "dev" +optional = false +python-versions = ">=3.5" +files = [ + {file = "mypy-0.910-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:a155d80ea6cee511a3694b108c4494a39f42de11ee4e61e72bc424c490e46457"}, + {file = "mypy-0.910-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:b94e4b785e304a04ea0828759172a15add27088520dc7e49ceade7834275bedb"}, + {file = "mypy-0.910-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:088cd9c7904b4ad80bec811053272986611b84221835e079be5bcad029e79dd9"}, + {file = "mypy-0.910-cp35-cp35m-win_amd64.whl", hash = "sha256:adaeee09bfde366d2c13fe6093a7df5df83c9a2ba98638c7d76b010694db760e"}, + {file = "mypy-0.910-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:ecd2c3fe726758037234c93df7e98deb257fd15c24c9180dacf1ef829da5f921"}, + {file = "mypy-0.910-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:d9dd839eb0dc1bbe866a288ba3c1afc33a202015d2ad83b31e875b5905a079b6"}, + {file = "mypy-0.910-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:3e382b29f8e0ccf19a2df2b29a167591245df90c0b5a2542249873b5c1d78212"}, + {file = "mypy-0.910-cp36-cp36m-win_amd64.whl", hash = "sha256:53fd2eb27a8ee2892614370896956af2ff61254c275aaee4c230ae771cadd885"}, + {file = "mypy-0.910-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b6fb13123aeef4a3abbcfd7e71773ff3ff1526a7d3dc538f3929a49b42be03f0"}, + {file = "mypy-0.910-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:e4dab234478e3bd3ce83bac4193b2ecd9cf94e720ddd95ce69840273bf44f6de"}, + {file = "mypy-0.910-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:7df1ead20c81371ccd6091fa3e2878559b5c4d4caadaf1a484cf88d93ca06703"}, + {file = "mypy-0.910-cp37-cp37m-win_amd64.whl", hash = "sha256:0aadfb2d3935988ec3815952e44058a3100499f5be5b28c34ac9d79f002a4a9a"}, + {file = "mypy-0.910-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ec4e0cd079db280b6bdabdc807047ff3e199f334050db5cbb91ba3e959a67504"}, + {file = "mypy-0.910-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:119bed3832d961f3a880787bf621634ba042cb8dc850a7429f643508eeac97b9"}, + {file = "mypy-0.910-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:866c41f28cee548475f146aa4d39a51cf3b6a84246969f3759cb3e9c742fc072"}, + {file = 
"mypy-0.910-cp38-cp38-win_amd64.whl", hash = "sha256:ceb6e0a6e27fb364fb3853389607cf7eb3a126ad335790fa1e14ed02fba50811"}, + {file = "mypy-0.910-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1a85e280d4d217150ce8cb1a6dddffd14e753a4e0c3cf90baabb32cefa41b59e"}, + {file = "mypy-0.910-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:42c266ced41b65ed40a282c575705325fa7991af370036d3f134518336636f5b"}, + {file = "mypy-0.910-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:3c4b8ca36877fc75339253721f69603a9c7fdb5d4d5a95a1a1b899d8b86a4de2"}, + {file = "mypy-0.910-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:c0df2d30ed496a08de5daed2a9ea807d07c21ae0ab23acf541ab88c24b26ab97"}, + {file = "mypy-0.910-cp39-cp39-win_amd64.whl", hash = "sha256:c6c2602dffb74867498f86e6129fd52a2770c48b7cd3ece77ada4fa38f94eba8"}, + {file = "mypy-0.910-py3-none-any.whl", hash = "sha256:ef565033fa5a958e62796867b1df10c40263ea9ded87164d67572834e57a174d"}, + {file = "mypy-0.910.tar.gz", hash = "sha256:704098302473cb31a218f1775a873b376b30b4c18229421e9e9dc8916fd16150"}, +] + +[package.dependencies] +mypy-extensions = ">=0.4.3,<0.5.0" +toml = "*" +typing-extensions = ">=3.7.4" + +[package.extras] +dmypy = ["psutil (>=4.0)"] +python2 = ["typed-ast (>=1.4.0,<1.5.0)"] + +[[package]] +name = "mypy-extensions" +version = "0.4.4" +description = "Experimental type system extensions for programs checked with the mypy typechecker." +category = "dev" +optional = false +python-versions = ">=2.7" +files = [ + {file = "mypy_extensions-0.4.4.tar.gz", hash = "sha256:c8b707883a96efe9b4bb3aaf0dcc07e7e217d7d8368eec4db4049ee9e142f4fd"}, +] + [[package]] name = "networkx" version = "3.1" description = "Python package for creating and manipulating graphs and networks" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -509,6 +687,7 @@ test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] name = "openai" version = "0.27.8" description = "Python client library for the OpenAI API" +category = "main" optional = false python-versions = ">=3.7.1" files = [ @@ -523,7 +702,7 @@ tqdm = "*" [package.extras] datalib = ["numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] -dev = ["black (>=21.6b0,<22.0)", "pytest (==6.*)", "pytest-asyncio", "pytest-mock"] +dev = ["black (>=21.6b0,<22.0)", "pytest (>=6.0.0,<7.0.0)", "pytest-asyncio", "pytest-mock"] embeddings = ["matplotlib", "numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)", "plotly", "scikit-learn (>=1.0.2)", "scipy", "tenacity (>=8.0.1)"] wandb = ["numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)", "wandb"] @@ -531,6 +710,7 @@ wandb = ["numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1 name = "packaging" version = "23.1" description = "Core utilities for Python packages" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -539,114 +719,105 @@ files = [ ] [[package]] -name = "pexpect" -version = "4.8.0" -description = "Pexpect allows easy control of interactive console applications." +name = "pathspec" +version = "0.11.1" +description = "Utility library for gitignore style pattern matching of file paths." 
+category = "dev" optional = false -python-versions = "*" +python-versions = ">=3.7" files = [ - {file = "pexpect-4.8.0-py2.py3-none-any.whl", hash = "sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937"}, - {file = "pexpect-4.8.0.tar.gz", hash = "sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c"}, + {file = "pathspec-0.11.1-py3-none-any.whl", hash = "sha256:d8af70af76652554bd134c22b3e8a1cc46ed7d91edcdd721ef1a0c51a84a5293"}, + {file = "pathspec-0.11.1.tar.gz", hash = "sha256:2798de800fa92780e33acca925945e9a19a133b715067cf165b8866c15a31687"}, ] -[package.dependencies] -ptyprocess = ">=0.5" - [[package]] -name = "pluggy" -version = "1.0.0" -description = "plugin and hook calling mechanisms for python" +name = "platformdirs" +version = "3.8.0" +description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." +category = "dev" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, - {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, + {file = "platformdirs-3.8.0-py3-none-any.whl", hash = "sha256:ca9ed98ce73076ba72e092b23d3c93ea6c4e186b3f1c3dad6edd98ff6ffcca2e"}, + {file = "platformdirs-3.8.0.tar.gz", hash = "sha256:b0cabcb11063d21a0b261d557acb0a9d2126350e63b70cdf7db6347baea456dc"}, ] [package.extras] -dev = ["pre-commit", "tox"] -testing = ["pytest", "pytest-benchmark"] +docs = ["furo (>=2023.5.20)", "proselint (>=0.13)", "sphinx (>=7.0.1)", "sphinx-autodoc-typehints (>=1.23,!=1.23.4)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest-cov (>=4.1)", "pytest-mock (>=3.10)"] [[package]] -name = "psutil" -version = "5.9.5" -description = "Cross-platform lib for process and system monitoring in Python." 
+name = "pluggy" +version = "1.2.0" +description = "plugin and hook calling mechanisms for python" +category = "main" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +python-versions = ">=3.7" files = [ - {file = "psutil-5.9.5-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:be8929ce4313f9f8146caad4272f6abb8bf99fc6cf59344a3167ecd74f4f203f"}, - {file = "psutil-5.9.5-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:ab8ed1a1d77c95453db1ae00a3f9c50227ebd955437bcf2a574ba8adbf6a74d5"}, - {file = "psutil-5.9.5-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:4aef137f3345082a3d3232187aeb4ac4ef959ba3d7c10c33dd73763fbc063da4"}, - {file = "psutil-5.9.5-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:ea8518d152174e1249c4f2a1c89e3e6065941df2fa13a1ab45327716a23c2b48"}, - {file = "psutil-5.9.5-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:acf2aef9391710afded549ff602b5887d7a2349831ae4c26be7c807c0a39fac4"}, - {file = "psutil-5.9.5-cp27-none-win32.whl", hash = "sha256:5b9b8cb93f507e8dbaf22af6a2fd0ccbe8244bf30b1baad6b3954e935157ae3f"}, - {file = "psutil-5.9.5-cp27-none-win_amd64.whl", hash = "sha256:8c5f7c5a052d1d567db4ddd231a9d27a74e8e4a9c3f44b1032762bd7b9fdcd42"}, - {file = "psutil-5.9.5-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:3c6f686f4225553615612f6d9bc21f1c0e305f75d7d8454f9b46e901778e7217"}, - {file = "psutil-5.9.5-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7a7dd9997128a0d928ed4fb2c2d57e5102bb6089027939f3b722f3a210f9a8da"}, - {file = "psutil-5.9.5-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89518112647f1276b03ca97b65cc7f64ca587b1eb0278383017c2a0dcc26cbe4"}, - {file = "psutil-5.9.5-cp36-abi3-win32.whl", hash = "sha256:104a5cc0e31baa2bcf67900be36acde157756b9c44017b86b2c049f11957887d"}, - {file = "psutil-5.9.5-cp36-abi3-win_amd64.whl", hash = "sha256:b258c0c1c9d145a1d5ceffab1134441c4c5113b2417fafff7315a917a026c3c9"}, - {file = "psutil-5.9.5-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:c607bb3b57dc779d55e1554846352b4e358c10fff3abf3514a7a6601beebdb30"}, - {file = "psutil-5.9.5.tar.gz", hash = "sha256:5410638e4df39c54d957fc51ce03048acd8e6d60abc0f5107af51e5fb566eb3c"}, + {file = "pluggy-1.2.0-py3-none-any.whl", hash = "sha256:c2fd55a7d7a3863cba1a013e4e2414658b1d07b6bc57b3919e0c63c9abb99849"}, + {file = "pluggy-1.2.0.tar.gz", hash = "sha256:d12f0c4b579b15f5e054301bb226ee85eeeba08ffec228092f8defbaa3a4c4b3"}, ] [package.extras] -test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] [[package]] -name = "ptyprocess" -version = "0.7.0" -description = "Run a subprocess in a pseudo terminal" +name = "pycodestyle" +version = "2.7.0" +description = "Python style guide checker" +category = "dev" optional = false -python-versions = "*" +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ - {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, - {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"}, + {file = "pycodestyle-2.7.0-py2.py3-none-any.whl", hash = "sha256:514f76d918fcc0b55c6680472f0a37970994e07bbb80725808c17089be302068"}, + {file = "pycodestyle-2.7.0.tar.gz", hash = "sha256:c389c1d06bf7904078ca03399a4816f974a1d590090fecea0c63ec26ebaf1cef"}, ] [[package]] name = "pydantic" -version = 
"1.10.9" +version = "1.10.10" description = "Data validation and settings management using python type hints" +category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "pydantic-1.10.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e692dec4a40bfb40ca530e07805b1208c1de071a18d26af4a2a0d79015b352ca"}, - {file = "pydantic-1.10.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3c52eb595db83e189419bf337b59154bdcca642ee4b2a09e5d7797e41ace783f"}, - {file = "pydantic-1.10.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:939328fd539b8d0edf244327398a667b6b140afd3bf7e347cf9813c736211896"}, - {file = "pydantic-1.10.9-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b48d3d634bca23b172f47f2335c617d3fcb4b3ba18481c96b7943a4c634f5c8d"}, - {file = "pydantic-1.10.9-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:f0b7628fb8efe60fe66fd4adadd7ad2304014770cdc1f4934db41fe46cc8825f"}, - {file = "pydantic-1.10.9-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:e1aa5c2410769ca28aa9a7841b80d9d9a1c5f223928ca8bec7e7c9a34d26b1d4"}, - {file = "pydantic-1.10.9-cp310-cp310-win_amd64.whl", hash = "sha256:eec39224b2b2e861259d6f3c8b6290d4e0fbdce147adb797484a42278a1a486f"}, - {file = "pydantic-1.10.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d111a21bbbfd85c17248130deac02bbd9b5e20b303338e0dbe0faa78330e37e0"}, - {file = "pydantic-1.10.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2e9aec8627a1a6823fc62fb96480abe3eb10168fd0d859ee3d3b395105ae19a7"}, - {file = "pydantic-1.10.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:07293ab08e7b4d3c9d7de4949a0ea571f11e4557d19ea24dd3ae0c524c0c334d"}, - {file = "pydantic-1.10.9-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7ee829b86ce984261d99ff2fd6e88f2230068d96c2a582f29583ed602ef3fc2c"}, - {file = "pydantic-1.10.9-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4b466a23009ff5cdd7076eb56aca537c745ca491293cc38e72bf1e0e00de5b91"}, - {file = "pydantic-1.10.9-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7847ca62e581e6088d9000f3c497267868ca2fa89432714e21a4fb33a04d52e8"}, - {file = "pydantic-1.10.9-cp311-cp311-win_amd64.whl", hash = "sha256:7845b31959468bc5b78d7b95ec52fe5be32b55d0d09983a877cca6aedc51068f"}, - {file = "pydantic-1.10.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:517a681919bf880ce1dac7e5bc0c3af1e58ba118fd774da2ffcd93c5f96eaece"}, - {file = "pydantic-1.10.9-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67195274fd27780f15c4c372f4ba9a5c02dad6d50647b917b6a92bf00b3d301a"}, - {file = "pydantic-1.10.9-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2196c06484da2b3fded1ab6dbe182bdabeb09f6318b7fdc412609ee2b564c49a"}, - {file = "pydantic-1.10.9-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:6257bb45ad78abacda13f15bde5886efd6bf549dd71085e64b8dcf9919c38b60"}, - {file = "pydantic-1.10.9-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:3283b574b01e8dbc982080d8287c968489d25329a463b29a90d4157de4f2baaf"}, - {file = "pydantic-1.10.9-cp37-cp37m-win_amd64.whl", hash = "sha256:5f8bbaf4013b9a50e8100333cc4e3fa2f81214033e05ac5aa44fa24a98670a29"}, - {file = "pydantic-1.10.9-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b9cd67fb763248cbe38f0593cd8611bfe4b8ad82acb3bdf2b0898c23415a1f82"}, - {file = "pydantic-1.10.9-cp38-cp38-macosx_11_0_arm64.whl", hash = 
"sha256:f50e1764ce9353be67267e7fd0da08349397c7db17a562ad036aa7c8f4adfdb6"}, - {file = "pydantic-1.10.9-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:73ef93e5e1d3c8e83f1ff2e7fdd026d9e063c7e089394869a6e2985696693766"}, - {file = "pydantic-1.10.9-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:128d9453d92e6e81e881dd7e2484e08d8b164da5507f62d06ceecf84bf2e21d3"}, - {file = "pydantic-1.10.9-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ad428e92ab68798d9326bb3e5515bc927444a3d71a93b4a2ca02a8a5d795c572"}, - {file = "pydantic-1.10.9-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:fab81a92f42d6d525dd47ced310b0c3e10c416bbfae5d59523e63ea22f82b31e"}, - {file = "pydantic-1.10.9-cp38-cp38-win_amd64.whl", hash = "sha256:963671eda0b6ba6926d8fc759e3e10335e1dc1b71ff2a43ed2efd6996634dafb"}, - {file = "pydantic-1.10.9-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:970b1bdc6243ef663ba5c7e36ac9ab1f2bfecb8ad297c9824b542d41a750b298"}, - {file = "pydantic-1.10.9-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7e1d5290044f620f80cf1c969c542a5468f3656de47b41aa78100c5baa2b8276"}, - {file = "pydantic-1.10.9-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:83fcff3c7df7adff880622a98022626f4f6dbce6639a88a15a3ce0f96466cb60"}, - {file = "pydantic-1.10.9-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0da48717dc9495d3a8f215e0d012599db6b8092db02acac5e0d58a65248ec5bc"}, - {file = "pydantic-1.10.9-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:0a2aabdc73c2a5960e87c3ffebca6ccde88665616d1fd6d3db3178ef427b267a"}, - {file = "pydantic-1.10.9-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9863b9420d99dfa9c064042304868e8ba08e89081428a1c471858aa2af6f57c4"}, - {file = "pydantic-1.10.9-cp39-cp39-win_amd64.whl", hash = "sha256:e7c9900b43ac14110efa977be3da28931ffc74c27e96ee89fbcaaf0b0fe338e1"}, - {file = "pydantic-1.10.9-py3-none-any.whl", hash = "sha256:6cafde02f6699ce4ff643417d1a9223716ec25e228ddc3b436fe7e2d25a1f305"}, - {file = "pydantic-1.10.9.tar.gz", hash = "sha256:95c70da2cd3b6ddf3b9645ecaa8d98f3d80c606624b6d245558d202cd23ea3be"}, + {file = "pydantic-1.10.10-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:adad1ee4ab9888f12dac2529276704e719efcf472e38df7813f5284db699b4ec"}, + {file = "pydantic-1.10.10-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7a7db03339893feef2092ff7b1afc9497beed15ebd4af84c3042a74abce02d48"}, + {file = "pydantic-1.10.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67b3714b97ff84b2689654851c2426389bcabfac9080617bcf4306c69db606f6"}, + {file = "pydantic-1.10.10-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:edfdf0a5abc5c9bf2052ebaec20e67abd52e92d257e4f2d30e02c354ed3e6030"}, + {file = "pydantic-1.10.10-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:20a3b30fd255eeeb63caa9483502ba96b7795ce5bf895c6a179b3d909d9f53a6"}, + {file = "pydantic-1.10.10-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:db4c7f7e60ca6f7d6c1785070f3e5771fcb9b2d88546e334d2f2c3934d949028"}, + {file = "pydantic-1.10.10-cp310-cp310-win_amd64.whl", hash = "sha256:a2d5be50ac4a0976817144c7d653e34df2f9436d15555189f5b6f61161d64183"}, + {file = "pydantic-1.10.10-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:566a04ba755e8f701b074ffb134ddb4d429f75d5dced3fbd829a527aafe74c71"}, + {file = "pydantic-1.10.10-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:f79db3652ed743309f116ba863dae0c974a41b688242482638b892246b7db21d"}, + {file = "pydantic-1.10.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c62376890b819bebe3c717a9ac841a532988372b7e600e76f75c9f7c128219d5"}, + {file = "pydantic-1.10.10-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4870f13a4fafd5bc3e93cff3169222534fad867918b188e83ee0496452978437"}, + {file = "pydantic-1.10.10-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:990027e77cda6072a566e433b6962ca3b96b4f3ae8bd54748e9d62a58284d9d7"}, + {file = "pydantic-1.10.10-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:8c40964596809eb616d94f9c7944511f620a1103d63d5510440ed2908fc410af"}, + {file = "pydantic-1.10.10-cp311-cp311-win_amd64.whl", hash = "sha256:ea9eebc2ebcba3717e77cdeee3f6203ffc0e78db5f7482c68b1293e8cc156e5e"}, + {file = "pydantic-1.10.10-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:762aa598f79b4cac2f275d13336b2dd8662febee2a9c450a49a2ab3bec4b385f"}, + {file = "pydantic-1.10.10-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6dab5219659f95e357d98d70577b361383057fb4414cfdb587014a5f5c595f7b"}, + {file = "pydantic-1.10.10-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3d4ee957a727ccb5a36f1b0a6dbd9fad5dedd2a41eada99a8df55c12896e18d"}, + {file = "pydantic-1.10.10-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:b69f9138dec566962ec65623c9d57bee44412d2fc71065a5f3ebb3820bdeee96"}, + {file = "pydantic-1.10.10-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:7aa75d1bd9cc275cf9782f50f60cddaf74cbaae19b6ada2a28e737edac420312"}, + {file = "pydantic-1.10.10-cp37-cp37m-win_amd64.whl", hash = "sha256:9f62a727f5c590c78c2d12fda302d1895141b767c6488fe623098f8792255fe5"}, + {file = "pydantic-1.10.10-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:aac218feb4af73db8417ca7518fb3bade4534fcca6e3fb00f84966811dd94450"}, + {file = "pydantic-1.10.10-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:88546dc10a40b5b52cae87d64666787aeb2878f9a9b37825aedc2f362e7ae1da"}, + {file = "pydantic-1.10.10-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c41bbaae89e32fc582448e71974de738c055aef5ab474fb25692981a08df808a"}, + {file = "pydantic-1.10.10-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b71bd504d1573b0b722ae536e8ffb796bedeef978979d076bf206e77dcc55a5"}, + {file = "pydantic-1.10.10-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:e088e3865a2270ecbc369924cd7d9fbc565667d9158e7f304e4097ebb9cf98dd"}, + {file = "pydantic-1.10.10-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:3403a090db45d4027d2344859d86eb797484dfda0706cf87af79ace6a35274ef"}, + {file = "pydantic-1.10.10-cp38-cp38-win_amd64.whl", hash = "sha256:e0014e29637125f4997c174dd6167407162d7af0da73414a9340461ea8573252"}, + {file = "pydantic-1.10.10-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9965e49c6905840e526e5429b09e4c154355b6ecc0a2f05492eda2928190311d"}, + {file = "pydantic-1.10.10-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:748d10ab6089c5d196e1c8be9de48274f71457b01e59736f7a09c9dc34f51887"}, + {file = "pydantic-1.10.10-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86936c383f7c38fd26d35107eb669c85d8f46dfceae873264d9bab46fe1c7dde"}, + {file = "pydantic-1.10.10-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:7a26841be620309a9697f5b1ffc47dce74909e350c5315ccdac7a853484d468a"}, + {file = "pydantic-1.10.10-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:409b810f387610cc7405ab2fa6f62bdf7ea485311845a242ebc0bd0496e7e5ac"}, + {file = "pydantic-1.10.10-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ce937a2a2c020bcad1c9fde02892392a1123de6dda906ddba62bfe8f3e5989a2"}, + {file = "pydantic-1.10.10-cp39-cp39-win_amd64.whl", hash = "sha256:37ebddef68370e6f26243acc94de56d291e01227a67b2ace26ea3543cf53dd5f"}, + {file = "pydantic-1.10.10-py3-none-any.whl", hash = "sha256:a5939ec826f7faec434e2d406ff5e4eaf1716eb1f247d68cd3d0b3612f7b4c8a"}, + {file = "pydantic-1.10.10.tar.gz", hash = "sha256:3b8d5bd97886f9eb59260594207c9f57dce14a6f869c6ceea90188715d29921a"}, ] [package.dependencies] @@ -656,15 +827,28 @@ typing-extensions = ">=4.2.0" dotenv = ["python-dotenv (>=0.10.4)"] email = ["email-validator (>=1.0.3)"] +[[package]] +name = "pyflakes" +version = "2.3.1" +description = "passive checker of Python programs" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "pyflakes-2.3.1-py2.py3-none-any.whl", hash = "sha256:7893783d01b8a89811dd72d7dfd4d84ff098e5eed95cfa8905b22bbffe52efc3"}, + {file = "pyflakes-2.3.1.tar.gz", hash = "sha256:f5bc8ecabc05bb9d291eb5203d6810b49040f6ff446a756326104746cc00c1db"}, +] + [[package]] name = "pytest" -version = "7.3.2" +version = "7.4.0" description = "pytest: simple powerful testing with Python" +category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "pytest-7.3.2-py3-none-any.whl", hash = "sha256:cdcbd012c9312258922f8cd3f1b62a6580fdced17db6014896053d47cddf9295"}, - {file = "pytest-7.3.2.tar.gz", hash = "sha256:ee990a3cc55ba808b80795a79944756f315c67c12b56abd3ac993a7b8c17030b"}, + {file = "pytest-7.4.0-py3-none-any.whl", hash = "sha256:78bf16451a2eb8c7a2ea98e32dc119fd2aa758f1d5d66dbf0a59d69a3969df32"}, + {file = "pytest-7.4.0.tar.gz", hash = "sha256:b4bf8c45bd59934ed84001ad51e11b4ee40d40a1229d2c79f9c592b0a3f6bd8a"}, ] [package.dependencies] @@ -682,6 +866,7 @@ testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "no name = "pytest-depends" version = "1.0.1" description = "Tests that depend on other tests" +category = "main" optional = false python-versions = "*" files = [ @@ -699,6 +884,7 @@ pytest = ">=3" name = "python-dotenv" version = "1.0.0" description = "Read key-value pairs from a .env file and set them as environment variables" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -709,33 +895,11 @@ files = [ [package.extras] cli = ["click (>=5.0)"] -[[package]] -name = "pywin32" -version = "306" -description = "Python for Window Extensions" -optional = false -python-versions = "*" -files = [ - {file = "pywin32-306-cp310-cp310-win32.whl", hash = "sha256:06d3420a5155ba65f0b72f2699b5bacf3109f36acbe8923765c22938a69dfc8d"}, - {file = "pywin32-306-cp310-cp310-win_amd64.whl", hash = "sha256:84f4471dbca1887ea3803d8848a1616429ac94a4a8d05f4bc9c5dcfd42ca99c8"}, - {file = "pywin32-306-cp311-cp311-win32.whl", hash = "sha256:e65028133d15b64d2ed8f06dd9fbc268352478d4f9289e69c190ecd6818b6407"}, - {file = "pywin32-306-cp311-cp311-win_amd64.whl", hash = "sha256:a7639f51c184c0272e93f244eb24dafca9b1855707d94c192d4a0b4c01e1100e"}, - {file = "pywin32-306-cp311-cp311-win_arm64.whl", hash = "sha256:70dba0c913d19f942a2db25217d9a1b726c278f483a919f1abfed79c9cf64d3a"}, - {file = "pywin32-306-cp312-cp312-win32.whl", hash = 
"sha256:383229d515657f4e3ed1343da8be101000562bf514591ff383ae940cad65458b"}, - {file = "pywin32-306-cp312-cp312-win_amd64.whl", hash = "sha256:37257794c1ad39ee9be652da0462dc2e394c8159dfd913a8a4e8eb6fd346da0e"}, - {file = "pywin32-306-cp312-cp312-win_arm64.whl", hash = "sha256:5821ec52f6d321aa59e2db7e0a35b997de60c201943557d108af9d4ae1ec7040"}, - {file = "pywin32-306-cp37-cp37m-win32.whl", hash = "sha256:1c73ea9a0d2283d889001998059f5eaaba3b6238f767c9cf2833b13e6a685f65"}, - {file = "pywin32-306-cp37-cp37m-win_amd64.whl", hash = "sha256:72c5f621542d7bdd4fdb716227be0dd3f8565c11b280be6315b06ace35487d36"}, - {file = "pywin32-306-cp38-cp38-win32.whl", hash = "sha256:e4c092e2589b5cf0d365849e73e02c391c1349958c5ac3e9d5ccb9a28e017b3a"}, - {file = "pywin32-306-cp38-cp38-win_amd64.whl", hash = "sha256:e8ac1ae3601bee6ca9f7cb4b5363bf1c0badb935ef243c4733ff9a393b1690c0"}, - {file = "pywin32-306-cp39-cp39-win32.whl", hash = "sha256:e25fd5b485b55ac9c057f67d94bc203f3f6595078d1fb3b458c9c28b7153a802"}, - {file = "pywin32-306-cp39-cp39-win_amd64.whl", hash = "sha256:39b61c15272833b5c329a2989999dcae836b1eed650252ab1b7bfbe1d59f30f4"}, -] - [[package]] name = "requests" version = "2.31.0" description = "Python HTTP for Humans." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -753,10 +917,23 @@ urllib3 = ">=1.21.1,<3" socks = ["PySocks (>=1.5.6,!=1.5.7)"] use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] +[[package]] +name = "toml" +version = "0.10.2" +description = "Python Library for Tom's Obvious, Minimal Language" +category = "dev" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, + {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, +] + [[package]] name = "tomli" version = "2.0.1" description = "A lil' TOML parser" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -768,6 +945,7 @@ files = [ name = "tqdm" version = "4.65.0" description = "Fast, Extensible Progress Meter" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -786,19 +964,21 @@ telegram = ["requests"] [[package]] name = "typing-extensions" -version = "4.6.3" +version = "4.7.0" description = "Backported and Experimental Type Hints for Python 3.7+" +category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "typing_extensions-4.6.3-py3-none-any.whl", hash = "sha256:88a4153d8505aabbb4e13aacb7c486c2b4a33ca3b3f807914a9b4c844c471c26"}, - {file = "typing_extensions-4.6.3.tar.gz", hash = "sha256:d91d5919357fe7f681a9f2b5b4cb2a5f1ef0a1e9f59c4d8ff0d3491e05c0ffd5"}, + {file = "typing_extensions-4.7.0-py3-none-any.whl", hash = "sha256:5d8c9dac95c27d20df12fb1d97b9793ab8b2af8a3a525e68c80e21060c161771"}, + {file = "typing_extensions-4.7.0.tar.gz", hash = "sha256:935ccf31549830cda708b42289d44b6f74084d616a00be651601a4f968e77c82"}, ] [[package]] name = "urllib3" version = "2.0.3" description = "HTTP library with thread-safe connection pooling, file post, and more." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -812,27 +992,11 @@ secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17. 
socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] -[[package]] -name = "wexpect" -version = "4.0.0" -description = "Windows alternative of pexpect" -optional = false -python-versions = "*" -files = [ - {file = "wexpect-4.0.0.tar.gz", hash = "sha256:de9e739e78ec4d74a39bf8499904dacb6c594007a674fb7e10752c9b131f6522"}, -] - -[package.dependencies] -psutil = ">=5.0.0" -pywin32 = ">=220" - -[package.extras] -test = ["codecov", "coverage", "pyinstaller", "setuptools (>=38.0)", "tox", "twine"] - [[package]] name = "yarl" version = "1.9.2" description = "Yet another URL library" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -918,5 +1082,5 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" -python-versions = "^3.9" -content-hash = "8ab722acade739b9fb841ecae3b8cabd4f1d8a355864573a93d9faa11dcffb90" +python-versions = "^3.10" +content-hash = "7b5ef821765fd03ae347d42a62be71cb50e97b778544da90a06d35e1808f8ac3" diff --git a/pyproject.toml b/pyproject.toml index 043fe68a2..b458f44bd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,15 +8,21 @@ readme = "README.md" packages = [{include = "agbenchmark"}] [tool.poetry.dependencies] -python = "^3.9" +python = "^3.10" pytest = "^7.3.2" -click = "^8.1.3" requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" pytest-depends = "^1.0.1" python-dotenv = "^1.0.0" +click = "^8.1.3" +[tool.poetry.group.dev.dependencies] +flake8 = "^3.9.2" +mypy = "^0.910" +isort = "^5.9.3" +black = "22.3" +autoflake = "^1.4" [build-system] requires = ["poetry-core"] @@ -36,3 +42,21 @@ markers = [ [tool.poetry.scripts] agbenchmark = "agbenchmark.start_benchmark:cli" + +[tool.black] +line-length = 88 +target-version = ['py310'] +include = '\.pyi?$' +packages = ["autogpt"] +extend-exclude = '(/dist|/.venv|/venv|/build|/agent)/' + +[tool.isort] +profile = "black" +multi_line_output = 3 +include_trailing_comma = true +force_grid_wrap = 0 +use_parentheses = true +ensure_newline_before_comments = true +line_length = 88 +sections = ["FUTURE", "STDLIB", "THIRDPARTY", "FIRSTPARTY", "LOCALFOLDER"] +skip_glob = [".tox", "__pycache__", "*.pyc", "venv*/*", "reports", "venv", "env", "node_modules", ".env", ".venv", "dist", "agent/*"] -- cgit v1.2.3 From 07133fb04103776bf639dfb5380d1c7dbb36fb92 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Mon, 3 Jul 2023 11:42:24 -0700 Subject: Run regression tests on push to master and stable (#46) --- .github/workflows/autogpt.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/autogpt.yml b/.github/workflows/autogpt.yml index e889b4c35..4316f36ff 100644 --- a/.github/workflows/autogpt.yml +++ b/.github/workflows/autogpt.yml @@ -2,6 +2,9 @@ name: Auto-GPT Regression Test on: workflow_dispatch: + branches: [ master ] + push: + branches: [ stable, master, ci-test* ] jobs: regression-tests: -- cgit v1.2.3 From 101ffdbce03086b3ef5cd56ef46bff2e58f99783 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Mon, 3 Jul 2023 11:53:28 -0700 Subject: Integrate with gpt engineer (#47) --- .github/workflows/gpt-engineer.yml | 70 +++++++++++++++++++++++++++ .gitmodules | 4 ++ agbenchmark/challenges/retrieval/Retrieval.py | 8 --- agbenchmark/challenges/retrieval/retrieval.py | 8 +++ agbenchmark/config.json | 5 -- agbenchmark/start_benchmark.py | 3 +- agent/Auto-GPT | 2 +- agent/gpt-engineer | 1 + config.json | 5 ++ poetry.lock | 17 +++---- pyproject.toml | 2 +- 11 files changed, 99 insertions(+), 26 deletions(-) create mode 100644 .github/workflows/gpt-engineer.yml delete mode 100644 
agbenchmark/challenges/retrieval/Retrieval.py create mode 100644 agbenchmark/challenges/retrieval/retrieval.py delete mode 100644 agbenchmark/config.json create mode 160000 agent/gpt-engineer create mode 100644 config.json diff --git a/.github/workflows/gpt-engineer.yml b/.github/workflows/gpt-engineer.yml new file mode 100644 index 000000000..e0dbac2f0 --- /dev/null +++ b/.github/workflows/gpt-engineer.yml @@ -0,0 +1,70 @@ +name: gpt-engineer Regression Test + +on: + workflow_dispatch: + branches: [ master ] + push: + branches: [ stable, master, ci-test* ] + +jobs: + regression-tests: + permissions: + pull-requests: write + contents: write + runs-on: ubuntu-latest + timeout-minutes: 30 + strategy: + matrix: + python-version: ["3.10"] + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 0 + ref: ${{ github.event.pull_request.head.ref }} + repository: ${{ github.event.pull_request.head.repo.full_name }} + submodules: true + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - id: get_date + name: Get date + run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT + + - name: Install Poetry + run: | + curl -sSL https://install.python-poetry.org | python - + + - name: Set up Poetry cache + uses: actions/cache@v2 + with: + path: | + ~/.cache/pypoetry + .venv + key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} + + - name: Set up venv and install Python dependencies + run: | + poetry install --only main + poetry build + + - name: Run regression tests + run: | + cd agent/gpt-engineer + make install + source venv/bin/activate + pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl + agbenchmark start --reg + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + + - name: Upload logs as artifact + if: always() + uses: actions/upload-artifact@v3 + with: + name: gpt-engineer-projects + path: agent/gpt-engineer/projects diff --git a/.gitmodules b/.gitmodules index 2e3a86e5f..b5b7ba249 100644 --- a/.gitmodules +++ b/.gitmodules @@ -2,3 +2,7 @@ path = agent/Auto-GPT url = https://github.com/Significant-Gravitas/Auto-GPT.git branch = benchmark-integration +[submodule "agent/gpt-engineer"] + path = agent/gpt-engineer + url = https://github.com/merwanehamadi/gpt-engineer.git + branch = benchmark-integration diff --git a/agbenchmark/challenges/retrieval/Retrieval.py b/agbenchmark/challenges/retrieval/Retrieval.py deleted file mode 100644 index 891cccef7..000000000 --- a/agbenchmark/challenges/retrieval/Retrieval.py +++ /dev/null @@ -1,8 +0,0 @@ -import pytest - -from agbenchmark.challenge import Challenge - - -@pytest.mark.retrieval -class RetrievalChallenge(Challenge): - """Challenge for information-retrieval""" diff --git a/agbenchmark/challenges/retrieval/retrieval.py b/agbenchmark/challenges/retrieval/retrieval.py new file mode 100644 index 000000000..891cccef7 --- /dev/null +++ b/agbenchmark/challenges/retrieval/retrieval.py @@ -0,0 +1,8 @@ +import pytest + +from agbenchmark.challenge import Challenge + + +@pytest.mark.retrieval +class RetrievalChallenge(Challenge): + """Challenge for information-retrieval""" diff --git a/agbenchmark/config.json b/agbenchmark/config.json deleted file mode 100644 index e1c5f154b..000000000 --- a/agbenchmark/config.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "workspace": "autogpt/workspace/auto_gpt_workspace", - "func_path": "benchmarks.py", - "cutoff": 60 
-} diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index 13e1af231..7489aa309 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -12,9 +12,8 @@ load_dotenv() CURRENT_DIRECTORY = Path(__file__).resolve().parent -new_path = CURRENT_DIRECTORY / "config.json" -CONFIG_PATH = str(new_path.resolve()) +CONFIG_PATH = str(Path(os.getcwd()) / "config.json") REGRESSION_TESTS_PATH = str(Path(os.getcwd()) / "regression_tests.json") diff --git a/agent/Auto-GPT b/agent/Auto-GPT index c29ec925f..2e5eac51d 160000 --- a/agent/Auto-GPT +++ b/agent/Auto-GPT @@ -1 +1 @@ -Subproject commit c29ec925fd9e24f219ef0f2884b08908cd66239b +Subproject commit 2e5eac51d06d495919d720d370c4d9efd49f4784 diff --git a/agent/gpt-engineer b/agent/gpt-engineer new file mode 160000 index 000000000..f91ac66b8 --- /dev/null +++ b/agent/gpt-engineer @@ -0,0 +1 @@ +Subproject commit f91ac66b8e8210760aaa0047f2ca11c52e55aaa5 diff --git a/config.json b/config.json new file mode 100644 index 000000000..652618e4b --- /dev/null +++ b/config.json @@ -0,0 +1,5 @@ +{ + "workspace": "projects/my-new-project/workspace", + "func_path": "benchmarks.py", + "cutoff": 60 +} diff --git a/poetry.lock b/poetry.lock index e05fc6c04..4eae340b6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -210,7 +210,6 @@ mypy-extensions = ">=0.4.3" pathspec = ">=0.9.0" platformdirs = ">=2" tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} -typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""} [package.extras] colorama = ["colorama (>=0.4.3)"] @@ -882,14 +881,14 @@ pytest = ">=3" [[package]] name = "python-dotenv" -version = "1.0.0" +version = "0.21.1" description = "Read key-value pairs from a .env file and set them as environment variables" category = "main" optional = false -python-versions = ">=3.8" +python-versions = ">=3.7" files = [ - {file = "python-dotenv-1.0.0.tar.gz", hash = "sha256:a8df96034aae6d2d50a4ebe8216326c61c3eb64836776504fcca410e5937a3ba"}, - {file = "python_dotenv-1.0.0-py3-none-any.whl", hash = "sha256:f5971a9226b701070a4bf2c38c89e5a3f0d64de8debda981d1db98583009122a"}, + {file = "python-dotenv-0.21.1.tar.gz", hash = "sha256:1c93de8f636cde3ce377292818d0e440b6e45a82f215c3744979151fa8151c49"}, + {file = "python_dotenv-0.21.1-py3-none-any.whl", hash = "sha256:41e12e0318bebc859fcc4d97d4db8d20ad21721a6aa5047dd59f090391cb549a"}, ] [package.extras] @@ -964,14 +963,14 @@ telegram = ["requests"] [[package]] name = "typing-extensions" -version = "4.7.0" +version = "4.7.1" description = "Backported and Experimental Type Hints for Python 3.7+" category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "typing_extensions-4.7.0-py3-none-any.whl", hash = "sha256:5d8c9dac95c27d20df12fb1d97b9793ab8b2af8a3a525e68c80e21060c161771"}, - {file = "typing_extensions-4.7.0.tar.gz", hash = "sha256:935ccf31549830cda708b42289d44b6f74084d616a00be651601a4f968e77c82"}, + {file = "typing_extensions-4.7.1-py3-none-any.whl", hash = "sha256:440d5dd3af93b060174bf433bccd69b0babc3b15b1a8dca43789fd7f61514b36"}, + {file = "typing_extensions-4.7.1.tar.gz", hash = "sha256:b75ddc264f0ba5615db7ba217daeb99701ad295353c45f9e95963337ceeeffb2"}, ] [[package]] @@ -1083,4 +1082,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "7b5ef821765fd03ae347d42a62be71cb50e97b778544da90a06d35e1808f8ac3" +content-hash = "44b5789494e73f3cb8bcb9d25daa62143e59352a246fd7724fdb3ad58c2560ae" diff --git a/pyproject.toml b/pyproject.toml index 
b458f44bd..7e95969af 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ requests = "^2.31.0" openai = "^0.27.8" pydantic = "^1.10.9" pytest-depends = "^1.0.1" -python-dotenv = "^1.0.0" +python-dotenv = "^0.21.0" click = "^8.1.3" [tool.poetry.group.dev.dependencies] -- cgit v1.2.3 From f183e91ccd1067f3381010687b578554183121b0 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Mon, 3 Jul 2023 17:28:29 -0700 Subject: Integrate smol developer with agbenchmark (#48) --- .github/workflows/smol-developer.yml | 64 ++++++++++++++++++++++++++++++++++++ .gitmodules | 4 +++ agent/smol-developer | 1 + 3 files changed, 69 insertions(+) create mode 100644 .github/workflows/smol-developer.yml create mode 160000 agent/smol-developer diff --git a/.github/workflows/smol-developer.yml b/.github/workflows/smol-developer.yml new file mode 100644 index 000000000..13ee8cf8d --- /dev/null +++ b/.github/workflows/smol-developer.yml @@ -0,0 +1,64 @@ +name: smol developer Regression Test + +on: + workflow_dispatch: + branches: [ master ] + push: + branches: [ stable, master, ci-test* ] + +jobs: + regression-tests: + permissions: + pull-requests: write + contents: write + runs-on: ubuntu-latest + timeout-minutes: 30 + strategy: + matrix: + python-version: ["3.10"] + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 0 + ref: ${{ github.event.pull_request.head.ref }} + repository: ${{ github.event.pull_request.head.repo.full_name }} + submodules: true + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - id: get_date + name: Get date + run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT + + - name: Install Poetry + run: | + curl -sSL https://install.python-poetry.org | python - + + - name: Set up Poetry cache + uses: actions/cache@v2 + with: + path: | + ~/.cache/pypoetry + .venv + key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} + + - name: Set up venv and install Python dependencies + run: | + poetry install --only main + poetry build + + - name: Run regression tests + run: | + cd agent/smol-developer + python -m venv venv + source venv/bin/activate + pip install -r requirements.txt + pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl + agbenchmark start --reg + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} diff --git a/.gitmodules b/.gitmodules index b5b7ba249..b45a16ada 100644 --- a/.gitmodules +++ b/.gitmodules @@ -6,3 +6,7 @@ path = agent/gpt-engineer url = https://github.com/merwanehamadi/gpt-engineer.git branch = benchmark-integration +[submodule "agent/smol-developer"] + path = agent/smol-developer + url = https://github.com/merwanehamadi/developer.git + branch = benchmark-integration diff --git a/agent/smol-developer b/agent/smol-developer new file mode 160000 index 000000000..896198af5 --- /dev/null +++ b/agent/smol-developer @@ -0,0 +1 @@ +Subproject commit 896198af51dd86dc3cfc2e258c3479948844e283 -- cgit v1.2.3 From 5318535d0d26bbd819c135a5f1b8022133c79fcb Mon Sep 17 00:00:00 2001 From: James Date: Tue, 4 Jul 2023 21:28:02 +0800 Subject: Fix summarize_text recursion calls (#4876) `summarize_text` is currently broken, because it calls itself with the wrong args (missing `config`) --- autogpt/processing/text.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/autogpt/processing/text.py b/autogpt/processing/text.py index 
24851b1c4..cf81fa71d 100644 --- a/autogpt/processing/text.py +++ b/autogpt/processing/text.py @@ -131,13 +131,12 @@ def summarize_text( logger.info( f"Summarizing chunk {i + 1} / {len(chunks)} of length {chunk_length} tokens" ) - summary, _ = summarize_text(chunk, instruction) + summary, _ = summarize_text(chunk, config, instruction) summaries.append(summary) logger.info(f"Summarized {len(chunks)} chunks") - summary, _ = summarize_text("\n\n".join(summaries)) - + summary, _ = summarize_text("\n\n".join(summaries), config, instruction) return summary.strip(), [ (summaries[i], chunks[i][0]) for i in range(0, len(chunks)) ] -- cgit v1.2.3 From 7f098d5fb6e652f78267294da4dfe5296760e031 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Tue, 4 Jul 2023 09:13:29 -0700 Subject: Explain how to benchmark new agents (#49) --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index 504132ddb..c0f67f153 100644 --- a/README.md +++ b/README.md @@ -115,3 +115,14 @@ Manually created, existing challenges within Auto-Gpt, https://osu-nlp-group.git | | |-- basic_abilities/ **every llm should pass these challenges** | | |-- regression/ **challenges that already passed** ``` + +## How to add new agents to agbenchmark ? +Example with smol developer. + +1- Create a github branch with your agent following the same pattern as this example: + +https://github.com/smol-ai/developer/pull/114/files + +2- Create the submodule and the github workflow by following the same pattern as this example: + +https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/pull/48/files -- cgit v1.2.3 From e25f6103443b83f017c4d0bd3a7be9c98cf7e83a Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Tue, 4 Jul 2023 13:23:00 -0400 Subject: local runs, home_path config, submodule miniagi (#50) --- .github/workflows/mini-agi.yml | 63 ++++++++++ .gitmodules | 4 + README.md | 128 ++------------------- agbenchmark/README.md | 126 ++++++++++++++++++++ agbenchmark/agent_interface.py | 11 +- agbenchmark/start_benchmark.py | 2 +- .../basic_abilities/write_file/write_file_test.py | 2 + agent/Auto-GPT | 2 +- agent/benchmarks.py | 15 --- agent/benchmarks_example.py | 35 ++++++ agent/config_example.json | 6 + agent/gpt-engineer | 2 +- agent/mini-agi | 1 + agent/regression_tests_example.json | 7 ++ agent/smol-developer | 2 +- config.json | 3 +- 16 files changed, 262 insertions(+), 147 deletions(-) create mode 100644 .github/workflows/mini-agi.yml create mode 100644 agbenchmark/README.md delete mode 100644 agent/benchmarks.py create mode 100644 agent/benchmarks_example.py create mode 100644 agent/config_example.json create mode 160000 agent/mini-agi create mode 100644 agent/regression_tests_example.json diff --git a/.github/workflows/mini-agi.yml b/.github/workflows/mini-agi.yml new file mode 100644 index 000000000..92980572a --- /dev/null +++ b/.github/workflows/mini-agi.yml @@ -0,0 +1,63 @@ +name: mini-agi Regression Test + +on: + workflow_dispatch: + branches: [master] + push: + branches: [stable, master, ci-test*] + +jobs: + regression-tests: + permissions: + pull-requests: write + contents: write + runs-on: ubuntu-latest + timeout-minutes: 30 + strategy: + matrix: + python-version: ['3.10'] + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 0 + ref: ${{ github.event.pull_request.head.ref }} + repository: ${{ github.event.pull_request.head.repo.full_name }} + submodules: true + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + 
with: + python-version: ${{ matrix.python-version }} + + - id: get_date + name: Get date + run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT + + - name: Install Poetry + run: | + curl -sSL https://install.python-poetry.org | python - + + - name: Set up Poetry cache + uses: actions/cache@v2 + with: + path: | + ~/.cache/pypoetry + .venv + key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} + + - name: Set up venv and install Python dependencies + run: | + poetry install --only main + poetry build + + - name: Run regression tests + run: | + cd agent/mini-agi + make install + source venv/bin/activate + pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl + agbenchmark start --reg + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} diff --git a/.gitmodules b/.gitmodules index b45a16ada..5af445f7a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -6,6 +6,10 @@ path = agent/gpt-engineer url = https://github.com/merwanehamadi/gpt-engineer.git branch = benchmark-integration +[submodule "agent/mini-agi"] + path = agent/mini-agi + url = https://github.com/SilenNaihin/mini-agi.git + branch = benchmark-integration [submodule "agent/smol-developer"] path = agent/smol-developer url = https://github.com/merwanehamadi/developer.git diff --git a/README.md b/README.md index c0f67f153..ed348b5ab 100644 --- a/README.md +++ b/README.md @@ -2,127 +2,13 @@ A repo built for the purpose of benchmarking the performance of agents far and wide, regardless of how they are set up and how they work -## As a user +### Scores: -1. `pip install auto-gpt-benchmarks` -2. Add boilerplate code to run and kill agent -3. `agbenchmark start` - - `--category challenge_category` to run tests in a specific category - - `--mock` to only run mock tests if they exists for each test - - `--noreg` to skip any tests that have passed in the past. When you run without this flag and a previous challenge that passed fails, it will now not be regression tests -4. We call boilerplate code for your agent -5. Show pass rate of tests, logs, and any other metrics +Scoring of agents will go here. Both overall and by category. -## Contributing +### Integrated Agents -##### Diagrams: https://whimsical.com/agbenchmark-5n4hXBq1ZGzBwRsK4TVY7x - -### To run the existing mocks - -1. clone the repo `auto-gpt-benchmarks` -2. `pip install poetry` -3. `poetry shell` -4. `poetry install` -5. `cp .env_example .env` -6. `agbenchmark start --mock` - Keep config the same and watch the logs :) - -### To run with mini-agi - -1. Navigate to `auto-gpt-benchmarks/agent/mini-agi` -2. `pip install -r requirements.txt` -3. `cp .env_example .env`, set `PROMPT_USER=false` and add your `OPENAI_API_KEY=`. Sset `MODEL="gpt-3.5-turbo"` if you don't have access to `gpt-4` yet. Also make sure you have Python 3.10^ installed -4. Make sure to follow the commands above, and remove mock flag `agbenchmark start` - -- To add requirements `poetry add requirement`. - -Feel free to create prs to merge with `main` at will (but also feel free to ask for review) - if you can't send msg in R&D chat for access. - -If you push at any point and break things - it'll happen to everyone - fix it asap. 
Step 1 is to revert `master` to last working commit - -Let people know what beautiful code you write does, document everything well - -Share your progress :) - -### Pytest - -an example of a test is below, use it as a template and change the class name, the .json name, what the test depends on and it's name, and the scoring logic - -```python -import pytest -from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge -import os - - -class TestWriteFile(BasicChallenge): - """Testing if LLM can write to a file""" - - def get_file_path(self) -> str: # all tests must implement this method - return os.path.join(os.path.dirname(__file__), "w_file_data.json") - - @pytest.mark.depends(on=[], name="basic_write_file") - def test_method(self, workspace): - # implement scoring logic by looking at workspace -``` - -All challenges will inherit from parent class which has the mark and any specific methods for their category - -```python -@pytest.mark.basic -class BasicChallenge(Challenge): - pass -``` - -Add the below to create a file in the workspace prior to running a challenge. Only use when a file is needed to be created in the workspace prior to a test, such as with the read_file_test. -```python -@pytest.fixture( - scope="module", autouse=True - ) # this is specific to setting up a file for the test, not all tests have this - def setup_module(self, workspace): - Challenge.write_to_file( - workspace, self.data.ground.files[0], "this is how we're doing" - ) -``` - -#### The main Challenge class has all the parametrization and loading logic so that all tests can inherit from it. It lives within [this file](https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/blob/master/agbenchmark/Challenge.py) - -## Workspace - -If `--mock` flag is used it is at `agbenchmark/mocks/workspace`. Otherwise for mini-agi it is at `C:/Users//miniagi` - it will be automitcally set on config - -#### Dataset - -Manually created, existing challenges within Auto-Gpt, https://osu-nlp-group.github.io/Mind2Web/ - -## Repo - -``` -|-- auto-gpt-benchmarks/ **main project directory** -| |-- metrics.py **combining scores, metrics, final evaluation** -| |-- start_benchmark.py **entry point from cli** -| |-- conftest.py **config, workspace creation + teardown, regression tesst markers, parameterization** -| |-- Challenge.py **easy challenge creation class** -| |-- config.json **workspace folder** -| |-- challenges/ **challenges across different domains** -| | |-- adaptability/ -| | |-- basic_abilities/ -| | |-- code/ -| | |-- memory/ -| | |-- retrieval/ -| | |-- web_navigation/ -| | |-- writing/ -| |-- tests/ -| | |-- basic_abilities/ **every llm should pass these challenges** -| | |-- regression/ **challenges that already passed** -``` - -## How to add new agents to agbenchmark ? -Example with smol developer. - -1- Create a github branch with your agent following the same pattern as this example: - -https://github.com/smol-ai/developer/pull/114/files - -2- Create the submodule and the github workflow by following the same pattern as this example: - -https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/pull/48/files +- Auto-GPT +- gpt-engineer +- mini-agi +- smol-developer diff --git a/agbenchmark/README.md b/agbenchmark/README.md new file mode 100644 index 000000000..a478f83f3 --- /dev/null +++ b/agbenchmark/README.md @@ -0,0 +1,126 @@ +## As a user + +1. `pip install auto-gpt-benchmarks` +2. Add boilerplate code to run and kill agent +3. 
`agbenchmark start`
+   - `--category challenge_category` to run tests in a specific category
+   - `--mock` to only run mock tests if they exist for each test
+   - `--noreg` to skip any tests that have passed in the past. When you run without this flag and a previously passing challenge fails, it will no longer be counted as a regression test
+4. We call boilerplate code for your agent
+5. Show pass rate of tests, logs, and any other metrics
+
+## Contributing
+
+##### Diagrams: https://whimsical.com/agbenchmark-5n4hXBq1ZGzBwRsK4TVY7x
+
+### To run the existing mocks
+
+1. Clone the repo `auto-gpt-benchmarks`
+2. `pip install poetry`
+3. `poetry shell`
+4. `poetry install`
+5. `cp .env_example .env`
+6. `agbenchmark start --mock`
+   Keep config the same and watch the logs :)
+
+### To run with mini-agi
+
+1. Navigate to `auto-gpt-benchmarks/agent/mini-agi`
+2. `pip install -r requirements.txt`
+3. `cp .env_example .env`, set `PROMPT_USER=false` and add your `OPENAI_API_KEY=`. Set `MODEL="gpt-3.5-turbo"` if you don't have access to `gpt-4` yet. Also make sure you have Python 3.10 or higher installed
+4. Make sure to follow the commands above, and remove the mock flag: `agbenchmark start`
+
+- To add requirements, run `poetry add requirement`.
+
+Feel free to create PRs to merge with `main` at will (but also feel free to ask for review) - if you can't, send a message in the R&D chat for access.
+
+If you push at any point and break things - it'll happen to everyone - fix it ASAP. Step 1 is to revert `master` to the last working commit
+
+Let people know what your beautiful code does, and document everything well
+
+Share your progress :)
+
+### Pytest
+
+An example of a test is below; use it as a template and change the class name, the .json name, what the test depends on and its name, and the scoring logic
+
+```python
+import pytest
+from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge
+import os
+
+
+class TestWriteFile(BasicChallenge):
+    """Testing if LLM can write to a file"""
+
+    def get_file_path(self) -> str:  # all tests must implement this method
+        return os.path.join(os.path.dirname(__file__), "w_file_data.json")
+
+    @pytest.mark.depends(on=[], name="basic_write_file")
+    def test_method(self, workspace):
+        pass  # implement scoring logic by looking at workspace
+```
+
+All challenges will inherit from a parent class which has the mark and any specific methods for their category
+
+```python
+@pytest.mark.basic
+class BasicChallenge(Challenge):
+    pass
+```
+
+Add the below to create a file in the workspace before a challenge runs. Only use this when a test needs a file to exist in the workspace beforehand, such as with the read_file_test.
+
+```python
+    @pytest.fixture(
+        scope="module", autouse=True
+    )  # this is specific to setting up a file for the test, not all tests have this
+    def setup_module(self, workspace):
+        Challenge.write_to_file(
+            workspace, self.data.ground.files[0], "this is how we're doing"
+        )
+```
+
+#### The main Challenge class has all the parametrization and loading logic so that all tests can inherit from it. It lives within [this file](https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/blob/master/agbenchmark/Challenge.py)
+
+## Workspace
+
+If the `--mock` flag is used, it is at `agbenchmark/mocks/workspace`. 
Otherwise, for mini-agi it is at `C:/Users//miniagi` - it will be automatically set in the config
+
+#### Dataset
+
+Manually created, existing challenges within Auto-GPT, https://osu-nlp-group.github.io/Mind2Web/
+
+## Repo
+
+```
+|-- auto-gpt-benchmarks/ **main project directory**
+|   |-- metrics.py **combining scores, metrics, final evaluation**
+|   |-- start_benchmark.py **entry point from cli**
+|   |-- conftest.py **config, workspace creation + teardown, regression test markers, parameterization**
+|   |-- Challenge.py **easy challenge creation class**
+|   |-- config.json **workspace folder**
+|   |-- challenges/ **challenges across different domains**
+|   |   |-- adaptability/
+|   |   |-- basic_abilities/
+|   |   |-- code/
+|   |   |-- memory/
+|   |   |-- retrieval/
+|   |   |-- web_navigation/
+|   |   |-- writing/
+|   |-- tests/
+|   |   |-- basic_abilities/ **every LLM should pass these challenges**
+|   |   |-- regression/ **challenges that already passed**
+```
+
+## How to add new agents to agbenchmark?
+
+Example with smol developer.
+
+1. Create a GitHub branch with your agent following the same pattern as this example:
+
+https://github.com/smol-ai/developer/pull/114/files
+
+2. Create the submodule and the GitHub workflow by following the same pattern as this example:
+
+https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/pull/48/files
diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py
index bd75f8dbb..993aa242a 100644
--- a/agbenchmark/agent_interface.py
+++ b/agbenchmark/agent_interface.py
@@ -1,4 +1,3 @@
-import importlib
 import os
 import subprocess
 import sys
@@ -29,18 +28,18 @@ def run_agent(
         mock_manager.delegate(mock_func)
     else:
         timeout = config["cutoff"]
-        print(f"Running Python function '{config['func_path']}' with timeout {timeout}")
+        print(
+            f"Running Python function '{config['entry_path']}' with timeout {timeout}"
+        )

         # Get the current working directory
         cwd = os.getcwd()

         # Add current directory to Python's import path
         sys.path.append(cwd)
+        sys.path.append(os.path.join(cwd, config["home_path"]))

-        module_name = config["func_path"].replace("/", ".").rstrip(".py")
-        module = importlib.import_module(module_name)
-
-        command = [sys.executable, "benchmarks.py", str(task)]
+        command = [sys.executable, config["entry_path"], str(task)]
         process = subprocess.Popen(
             command,
             stdout=subprocess.PIPE,
diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py
index 7489aa309..8ef01d3c5 100644
--- a/agbenchmark/start_benchmark.py
+++ b/agbenchmark/start_benchmark.py
@@ -38,7 +38,7 @@ def start(category: str, reg: bool, mock: bool) -> int:
         default=os.path.join(Path.home(), "workspace"),
     )

-    config["func_path"] = click.prompt(
+    config["entry_path"] = click.prompt(
         "Please enter the path to your run_specific_agent function implementation",
         default="/benchmarks.py",
     )
diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py
index 05db09657..c59e03ccf 100644
--- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py
+++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py
@@ -1,6 +1,7 @@
 import os
 from pathlib import Path
 from typing import Any, Dict
+import pytest

 from agbenchmark.tests.basic_abilities.basic_challenge import BasicChallenge

@@ -11,6 +12,7 @@ class TestWriteFile(BasicChallenge):
     def get_file_path(self) -> str:  # all tests must implement this method
         return os.path.join(os.path.dirname(__file__), "w_file_data.json")

+    
@pytest.mark.depends(name="basic_write_file") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) diff --git a/agent/Auto-GPT b/agent/Auto-GPT index 2e5eac51d..dd65cc256 160000 --- a/agent/Auto-GPT +++ b/agent/Auto-GPT @@ -1 +1 @@ -Subproject commit 2e5eac51d06d495919d720d370c4d9efd49f4784 +Subproject commit dd65cc256ca72cb199fe8c5d6ae31c23a7acee62 diff --git a/agent/benchmarks.py b/agent/benchmarks.py deleted file mode 100644 index eb66412c1..000000000 --- a/agent/benchmarks.py +++ /dev/null @@ -1,15 +0,0 @@ -# import subprocess - - -def run_specific_agent(task, conn): - cycle_count = 0 - while ( - not conn.poll() - ): # Check if there's a termination signal from the main process - response = run_agent(task) # run the agent and get the response and cycle count - - if response: - cycle_count += 1 - - # Send response and cycle count back to the main process - conn.send((response, cycle_count)) diff --git a/agent/benchmarks_example.py b/agent/benchmarks_example.py new file mode 100644 index 000000000..0c35aa9bb --- /dev/null +++ b/agent/benchmarks_example.py @@ -0,0 +1,35 @@ +import os +import sys +from typing import Tuple +import pexpect + + +def run_specific_agent(task: str) -> Tuple[str, int]: + # Ensure the directory for the project exists + os.makedirs("workspace_path", exist_ok=True) + + # Run the agent command + child = pexpect.spawn(f"python example.py {task}") + + # Create a loop to continuously read output + while True: + try: + child.expect("\n") # This waits until a newline appears + print(child.before.decode()) # This prints the line + except pexpect.EOF: + break # No more output, break the loop + + # Check the exit status + child.close() # Close the child process + + # Return child process's exit status and any error messages + return child.before.decode(), child.exitstatus + + +if __name__ == "__main__": + # The first argument is the script name itself, second is the task + if len(sys.argv) != 2: + print("Usage: python script.py ") + sys.exit(1) + task = sys.argv[1] + run_specific_agent(task) diff --git a/agent/config_example.json b/agent/config_example.json new file mode 100644 index 000000000..ba2ec0b80 --- /dev/null +++ b/agent/config_example.json @@ -0,0 +1,6 @@ +{ + "workspace": "projects/my-new-project/workspace", + "entry_path": "benchmarks.py", + "home_path": "", + "cutoff": 60 +} diff --git a/agent/gpt-engineer b/agent/gpt-engineer index f91ac66b8..155ea895e 160000 --- a/agent/gpt-engineer +++ b/agent/gpt-engineer @@ -1 +1 @@ -Subproject commit f91ac66b8e8210760aaa0047f2ca11c52e55aaa5 +Subproject commit 155ea895eb5f7e44ed8647b335d90a03b5ffb06d diff --git a/agent/mini-agi b/agent/mini-agi new file mode 160000 index 000000000..70bd3f035 --- /dev/null +++ b/agent/mini-agi @@ -0,0 +1 @@ +Subproject commit 70bd3f035e7d898221cdb0fc2912d20037fec901 diff --git a/agent/regression_tests_example.json b/agent/regression_tests_example.json new file mode 100644 index 000000000..a0c76dc55 --- /dev/null +++ b/agent/regression_tests_example.json @@ -0,0 +1,7 @@ +{ + "TestWriteFile": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py" + } +} diff --git a/agent/smol-developer b/agent/smol-developer index 896198af5..5a3ad4310 160000 --- a/agent/smol-developer +++ b/agent/smol-developer @@ -1 +1 @@ -Subproject commit 896198af51dd86dc3cfc2e258c3479948844e283 +Subproject commit 5a3ad43103b238b9c8f2a2acceff250888be263e diff --git a/config.json b/config.json index 652618e4b..ba2ec0b80 
100644 --- a/config.json +++ b/config.json @@ -1,5 +1,6 @@ { "workspace": "projects/my-new-project/workspace", - "func_path": "benchmarks.py", + "entry_path": "benchmarks.py", + "home_path": "", "cutoff": 60 } -- cgit v1.2.3 From 73a3e9e42df7caf7d6c65e83898ad9893a829ede Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Tue, 4 Jul 2023 13:41:54 -0400 Subject: fixing mini-agi workflow --- .github/workflows/mini-agi.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/mini-agi.yml b/.github/workflows/mini-agi.yml index 92980572a..c62df7663 100644 --- a/.github/workflows/mini-agi.yml +++ b/.github/workflows/mini-agi.yml @@ -55,8 +55,8 @@ jobs: - name: Run regression tests run: | cd agent/mini-agi - make install - source venv/bin/activate + pip install -r requirements.txt + cp .env_example .env pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl agbenchmark start --reg env: -- cgit v1.2.3 From ed9aef5f437abc90c314c2e872623c0a2cb3d933 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Tue, 4 Jul 2023 13:51:04 -0400 Subject: adding venv to mini-agi --- .github/workflows/mini-agi.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/mini-agi.yml b/.github/workflows/mini-agi.yml index c62df7663..baf9100e8 100644 --- a/.github/workflows/mini-agi.yml +++ b/.github/workflows/mini-agi.yml @@ -55,6 +55,8 @@ jobs: - name: Run regression tests run: | cd agent/mini-agi + python -m venv venv + source venv/bin/activate pip install -r requirements.txt cp .env_example .env pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl -- cgit v1.2.3 From 62d37755bc72500f4e057450bc3c30b7d9e1e341 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Tue, 4 Jul 2023 14:11:38 -0400 Subject: updating submodule commit --- agent/mini-agi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agent/mini-agi b/agent/mini-agi index 70bd3f035..d99220c05 160000 --- a/agent/mini-agi +++ b/agent/mini-agi @@ -1 +1 @@ -Subproject commit 70bd3f035e7d898221cdb0fc2912d20037fec901 +Subproject commit d99220c058c9fb45f83256ea361b55dba506fa75 -- cgit v1.2.3 From e6e92e99525ef58ac179254d08fe52d2281c1fcc Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Tue, 4 Jul 2023 14:15:04 -0400 Subject: run in continuous --- .github/workflows/mini-agi.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/mini-agi.yml b/.github/workflows/mini-agi.yml index baf9100e8..055df304d 100644 --- a/.github/workflows/mini-agi.yml +++ b/.github/workflows/mini-agi.yml @@ -63,3 +63,4 @@ jobs: agbenchmark start --reg env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + PROMPT_USER: false -- cgit v1.2.3 From f8e550773f49aff8b39750b5a5ac37224d6609ce Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Tue, 4 Jul 2023 15:28:00 -0700 Subject: Add retrieval challenge test + run tests on CI pipeline (#51) --- .github/workflows/ci.yml | 40 ++++++++++++++++++++++ agbenchmark/challenges/retrieval/r1/r1_data.json | 18 +++++----- agbenchmark/challenges/retrieval/r1/r1_test.py | 10 ++++-- agbenchmark/mocks/tests/basic_mocks.py | 11 ++++++ agbenchmark/mocks/workspace/file_to_check.txt | 1 - .../basic_abilities/write_file/write_file_test.py | 1 + mypy.ini | 13 +++++++ regression_tests.json | 12 +++++++ 8 files changed, 94 insertions(+), 12 deletions(-) delete mode 100644 agbenchmark/mocks/workspace/file_to_check.txt diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2d25e4ffe..6fb348e9b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -66,3 +66,43 @@ 
jobs: cmd="poetry run autoflake --remove-all-unused-imports --recursive --ignore-init-module-imports --ignore-pass-after-docstring agbenchmark" $cmd --check || (echo "You have unused imports or pass statements, please run '${cmd} --in-place'" && exit 1) if: success() || failure() + + tests: + + runs-on: ubuntu-latest + env: + min-python-version: "3.10" + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 0 + ref: ${{ github.event.pull_request.head.ref }} + repository: ${{ github.event.pull_request.head.repo.full_name }} + + - name: Set up Python ${{ env.min-python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ env.min-python-version }} + + - id: get_date + name: Get date + run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT + + - name: Install Poetry + run: | + curl -sSL https://install.python-poetry.org | python - + + - name: Set up Poetry cache + uses: actions/cache@v2 + with: + path: | + ~/.cache/pypoetry + .venv + key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} + + - name: Install dependencies + run: | + poetry install + poetry run agbenchmark start --mock diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index 80c5e51eb..44fce24ea 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -1,21 +1,21 @@ { - "name": "retrieval1", - "category": ["basic"], - "task": "Print the the capital of America to a .txt file", + "name": "basic_information_retrieval", + "category": ["retrieval"], + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", "dependencies": [], "ground": { - "answer": "Washington", - "should_contain": ["Washington"], - "should_not_contain": ["New York", "Los Angeles", "San Francisco"], + "answer": "£25.89", + "should_contain": ["25.89"], + "should_not_contain": [], "files": [".txt"] }, "mock": { - "mock_func": "basic_write_file_mock", - "mock_task": "What is the capital of America?" + "mock_func": "basic_retrieval_mock", + "mock_task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file." 
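For readers skimming the challenge JSON above: the `ground` block (`answer`, `should_contain`, `should_not_contain`, `files`) drives a simple substring check against the files the agent produced. The following is a rough sketch of that check under stated assumptions — the real logic lives in `Challenge.scoring` and may differ:

```python
# Rough sketch of the substring scoring a "ground" block implies.
# Field names mirror the challenge JSON; the exact semantics are assumed.
from typing import Dict, List


def score(file_content: str, ground: Dict[str, List[str]]) -> float:
    if not all(s in file_content for s in ground.get("should_contain", [])):
        return 0.0  # a required string is missing
    if any(s in file_content for s in ground.get("should_not_contain", [])):
        return 0.0  # a forbidden string is present
    return 1.0


# The book-price challenge above would pass on a file containing "25.89".
print(score("The price of the book is 25.89", {"should_contain": ["25.89"]}))
```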
}, "info": { "difficulty": "basic", - "description": "Tests the writing to file", + "description": "Tests ability to retrieve information from a website.", "side_effects": ["tests if there is in fact an LLM attached"] } } diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 675ac8bd7..914784960 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -1,18 +1,24 @@ import os +from pathlib import Path from typing import Any, Dict +import pytest + from agbenchmark.challenges.retrieval.retrieval import RetrievalChallenge -class TestRetrieval1(RetrievalChallenge): +class TestRetrieval(RetrievalChallenge): """The first information-retrieval challenge""" def get_file_path(self) -> str: # all tests must implement this method return os.path.join(os.path.dirname(__file__), "r1_data.json") + @pytest.mark.depends(on=["basic_write_file"]) def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + + workspace = Path(os.getcwd()) / config["workspace"] + files_contents = self.open_files(workspace, self.data.ground.files) scores = [] for file_content in files_contents: diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index c79a8e2dd..07d8a6de0 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -22,3 +22,14 @@ def basic_write_file_mock(task: str, workspace: str) -> None: "file_to_check.txt", "Washington DC is the capital of the United States of America", ) + + +def basic_retrieval_mock(task: str, workspace: str) -> None: + """ + This mock writes to a file (creates one if it doesn't exist) + """ + Challenge.write_to_file( + workspace, + "file_to_check.txt", + "25.89", + ) diff --git a/agbenchmark/mocks/workspace/file_to_check.txt b/agbenchmark/mocks/workspace/file_to_check.txt deleted file mode 100644 index 48dc8cff1..000000000 --- a/agbenchmark/mocks/workspace/file_to_check.txt +++ /dev/null @@ -1 +0,0 @@ -Washington DC is the capital of the United States of America \ No newline at end of file diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index c59e03ccf..966df7f2d 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -1,6 +1,7 @@ import os from pathlib import Path from typing import Any, Dict + import pytest from agbenchmark.tests.basic_abilities.basic_challenge import BasicChallenge diff --git a/mypy.ini b/mypy.ini index 315ecae56..ceb13fcd2 100644 --- a/mypy.ini +++ b/mypy.ini @@ -3,3 +3,16 @@ follow_imports = skip check_untyped_defs = True disallow_untyped_defs = True exclude = ^(agent/.*\.py)$ +ignore_missing_imports = True + +[mypy-agbenchmark.mocks.mock_manager.*] +ignore_errors = True + +[mypy-agbenchmark.tests.basic_abilities.basic_challenge.*] +ignore_errors = True + +[mypy-agbenchmark.mocks.tests.basic_mocks.*] +ignore_errors = True + +[mypy-agbenchmark.tests.regression.RegressionManager.*] +ignore_errors = True diff --git a/regression_tests.json b/regression_tests.json index e3633a2af..9b998d115 100644 --- a/regression_tests.json +++ b/regression_tests.json @@ -1,7 +1,19 @@ { + "TestRetrieval": { + "difficulty": "basic", + "dependencies": [], + "test": 
"agbenchmark/challenges/retrieval/r1/r1_test.py" + }, "TestWriteFile": { "difficulty": "basic", "dependencies": [], "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py" + }, + "TestReadFile": { + "difficulty": "basic", + "dependencies": [ + "basic_write_file" + ], + "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py" } } \ No newline at end of file -- cgit v1.2.3 From cef259c945a50c3e3884564c63da4a9a6f2abcd5 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Tue, 4 Jul 2023 17:34:55 -0700 Subject: Add pr template (#52) --- .github/PULL_REQUEST_TEMPLATE.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 .github/PULL_REQUEST_TEMPLATE.md diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 000000000..ee5d8bf15 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,15 @@ +### Background + + +### Changes + + + +### PR Quality Checklist +- [ ] I have run the following commands against my code to ensure it passes our linters: + ```shell + black . + isort . + mypy . + autoflake --remove-all-unused-imports --recursive --ignore-init-module-imports --ignore-pass-after-docstring --in-place agbenchmark + ``` -- cgit v1.2.3 From e3c453f10e60f056ea8d8d28849264ab766d9c57 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Tue, 4 Jul 2023 17:41:13 -0700 Subject: Add information retrieval 3 (#54) Co-authored-by: Silen Naihin --- agbenchmark/challenges/retrieval/r2/r2_data.json | 21 +++++++++++++++++ agbenchmark/challenges/retrieval/r2/r2_test.py | 29 ++++++++++++++++++++++++ agbenchmark/challenges/retrieval/r3/r3_data.json | 21 +++++++++++++++++ agbenchmark/challenges/retrieval/r3/r3_test.py | 29 ++++++++++++++++++++++++ agbenchmark/mocks/tests/basic_mocks.py | 22 ++++++++++++++++++ regression_tests.json | 10 ++++++++ 6 files changed, 132 insertions(+) create mode 100644 agbenchmark/challenges/retrieval/r2/r2_data.json create mode 100644 agbenchmark/challenges/retrieval/r2/r2_test.py create mode 100644 agbenchmark/challenges/retrieval/r3/r3_data.json create mode 100644 agbenchmark/challenges/retrieval/r3/r3_test.py diff --git a/agbenchmark/challenges/retrieval/r2/r2_data.json b/agbenchmark/challenges/retrieval/r2/r2_data.json new file mode 100644 index 000000000..925e6db83 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r2/r2_data.json @@ -0,0 +1,21 @@ +{ + "name": "basic_information_retrieval", + "category": ["retrieval"], + "task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "dependencies": [], + "ground": { + "answer": "81,462", + "should_contain": ["81,462"], + "should_not_contain": [], + "files": [".txt"] + }, + "mock": { + "mock_func": "basic_retrieval_2_mock", + "mock_task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)." 
+ }, + "info": { + "difficulty": "basic", + "description": "Tests ability to retrieve information.", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/retrieval/r2/r2_test.py b/agbenchmark/challenges/retrieval/r2/r2_test.py new file mode 100644 index 000000000..bdc738868 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r2/r2_test.py @@ -0,0 +1,29 @@ +import os +from pathlib import Path +from typing import Any, Dict + +import pytest + +from agbenchmark.challenges.retrieval.retrieval import RetrievalChallenge + + +class TestRetrieval2(RetrievalChallenge): + """The first information-retrieval challenge""" + + def get_file_path(self) -> str: # all tests must implement this method + return os.path.join(os.path.dirname(__file__), "r2_data.json") + + @pytest.mark.depends(on=["basic_write_file"]) + def test_method(self, config: Dict[str, Any]) -> None: + self.setup_challenge(config) + + workspace = Path(os.getcwd()) / config["workspace"] + files_contents = self.open_files(workspace, self.data.ground.files) + + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, self.data.ground) + print("Your score is:", score) + scores.append(score) + + assert 1 in scores diff --git a/agbenchmark/challenges/retrieval/r3/r3_data.json b/agbenchmark/challenges/retrieval/r3/r3_data.json new file mode 100644 index 000000000..183529c48 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r3/r3_data.json @@ -0,0 +1,21 @@ +{ + "name": "basic_information_retrieval", + "category": ["retrieval"], + "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "dependencies": [], + "ground": { + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "should_contain": ["15", "112", "117", "204", "413", "2,014", "3,198", "4,046", "7,000", "11,759", "21,461", "24,578", "31,536", "53,823", "81,462"], + "should_not_contain": [], + "files": [".txt"] + }, + "mock": { + "mock_func": "basic_retrieval_3_mock", + "mock_task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)." 
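The `dependencies` fields in these challenge files and the `@pytest.mark.depends(...)` decorators in the tests come from the pytest-depends plugin. A minimal usage sketch, assuming the plugin is installed — a test publishes a name, and tests that depend on it are skipped when it fails:

```python
# Minimal pytest-depends sketch mirroring the decorators used in this repo.
import pytest


@pytest.mark.depends(name="basic_write_file")
def test_write_file() -> None:
    assert True


@pytest.mark.depends(on=["basic_write_file"])
def test_read_file() -> None:
    assert True  # skipped automatically if test_write_file fails
```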
+ }, + "info": { + "difficulty": "basic", + "description": "Tests ability to retrieve information.", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/retrieval/r3/r3_test.py b/agbenchmark/challenges/retrieval/r3/r3_test.py new file mode 100644 index 000000000..36382b69b --- /dev/null +++ b/agbenchmark/challenges/retrieval/r3/r3_test.py @@ -0,0 +1,29 @@ +import os +from pathlib import Path +from typing import Any, Dict + +import pytest + +from agbenchmark.challenges.retrieval.retrieval import RetrievalChallenge + + +class TestRetrieval3(RetrievalChallenge): + """The first information-retrieval challenge""" + + def get_file_path(self) -> str: # all tests must implement this method + return os.path.join(os.path.dirname(__file__), "r3_data.json") + + @pytest.mark.depends(on=["basic_write_file"]) + def test_method(self, config: Dict[str, Any]) -> None: + self.setup_challenge(config) + + workspace = Path(os.getcwd()) / config["workspace"] + files_contents = self.open_files(workspace, self.data.ground.files) + + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, self.data.ground) + print("Your score is:", score) + scores.append(score) + + assert 1 in scores diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index 07d8a6de0..882e3c829 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -33,3 +33,25 @@ def basic_retrieval_mock(task: str, workspace: str) -> None: "file_to_check.txt", "25.89", ) + + +def basic_retrieval_2_mock(task: str, workspace: str) -> None: + """ + This mock writes to a file (creates one if it doesn't exist) + """ + Challenge.write_to_file( + workspace, + "file_to_check.txt", + "81,462", + ) + + +def basic_retrieval_3_mock(task: str, workspace: str) -> None: + """ + This mock writes to a file (creates one if it doesn't exist) + """ + Challenge.write_to_file( + workspace, + "file_to_check.txt", + "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + ) diff --git a/regression_tests.json b/regression_tests.json index 9b998d115..853c38dcb 100644 --- a/regression_tests.json +++ b/regression_tests.json @@ -15,5 +15,15 @@ "basic_write_file" ], "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py" + }, + "TestRetrieval2": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/retrieval/r2/r2_test.py" + }, + "TestRetrieval3": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/retrieval/r3/r3_test.py" } } \ No newline at end of file -- cgit v1.2.3 From 351131bbffa2956cde7beacc1a7a95451c895b19 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Tue, 4 Jul 2023 18:45:35 -0700 Subject: Change test dependencies (#55) --- agbenchmark/challenges/retrieval/r1/r1_test.py | 2 +- agbenchmark/challenges/retrieval/r2/r2_test.py | 2 +- agbenchmark/challenges/retrieval/r3/r3_test.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 914784960..767775340 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -13,7 +13,7 @@ class TestRetrieval(RetrievalChallenge): def get_file_path(self) 
-> str: # all tests must implement this method return os.path.join(os.path.dirname(__file__), "r1_data.json") - @pytest.mark.depends(on=["basic_write_file"]) + @pytest.mark.depends(on=["basic_write_file"], name="test_retrieval") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) diff --git a/agbenchmark/challenges/retrieval/r2/r2_test.py b/agbenchmark/challenges/retrieval/r2/r2_test.py index bdc738868..7664ca36b 100644 --- a/agbenchmark/challenges/retrieval/r2/r2_test.py +++ b/agbenchmark/challenges/retrieval/r2/r2_test.py @@ -13,7 +13,7 @@ class TestRetrieval2(RetrievalChallenge): def get_file_path(self) -> str: # all tests must implement this method return os.path.join(os.path.dirname(__file__), "r2_data.json") - @pytest.mark.depends(on=["basic_write_file"]) + @pytest.mark.depends(on=["test_retrieval"], name="test_retrieval_2") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) diff --git a/agbenchmark/challenges/retrieval/r3/r3_test.py b/agbenchmark/challenges/retrieval/r3/r3_test.py index 36382b69b..c13de2c86 100644 --- a/agbenchmark/challenges/retrieval/r3/r3_test.py +++ b/agbenchmark/challenges/retrieval/r3/r3_test.py @@ -13,7 +13,7 @@ class TestRetrieval3(RetrievalChallenge): def get_file_path(self) -> str: # all tests must implement this method return os.path.join(os.path.dirname(__file__), "r3_data.json") - @pytest.mark.depends(on=["basic_write_file"]) + @pytest.mark.depends(on=["test_retrieval_2"], name="test_retrieval_3") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) -- cgit v1.2.3 From bfc7dfdb291099d75dcc1e0dbe3e03439b5163f5 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Tue, 4 Jul 2023 22:06:49 -0400 Subject: Dynamic workspace path (#56) --- agbenchmark/conftest.py | 23 ++++++++++++++++++++++- agent/mini-agi | 2 +- regression_tests.json | 2 +- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 613565fd2..66ede2c08 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -1,6 +1,7 @@ import json import os import shutil +from pathlib import Path # noqa from typing import Any, Dict, Generator, List import pytest @@ -9,6 +10,21 @@ from agbenchmark.start_benchmark import CONFIG_PATH, REGRESSION_TESTS_PATH from agbenchmark.tests.regression.RegressionManager import RegressionManager +def get_dynamic_workspace(config: Dict[str, Any]) -> str: + # Extract the string inside ${...} + path_expr = config["workspace"][2:-1] + + # Check if it starts with "os.path.join" + if path_expr.strip().startswith("os.path.join"): + # Evaluate the path string + path_value = eval(path_expr) + + # Replace the original string with the evaluated result + return path_value + else: + raise ValueError("Invalid workspace path expression.") + + @pytest.fixture(scope="module") def config(request: Any) -> None: print(f"Config file: {CONFIG_PATH}") @@ -17,11 +33,16 @@ def config(request: Any) -> None: if request.config.getoption("--mock"): config["workspace"] = "agbenchmark/mocks/workspace" + elif config.get("workspace", "").startswith("${") and config.get( + "workspace", "" + ).endswith("}"): + path = get_dynamic_workspace(config) + config["workspace"] = path return config -@pytest.fixture(scope="module") +@pytest.fixture(scope="module", autouse=True) def workspace(config: Dict[str, Any]) -> Generator[str, None, None]: yield config["workspace"] # teardown after test function completes diff --git a/agent/mini-agi b/agent/mini-agi 
index d99220c05..4af8a7e60 160000 --- a/agent/mini-agi +++ b/agent/mini-agi @@ -1 +1 @@ -Subproject commit d99220c058c9fb45f83256ea361b55dba506fa75 +Subproject commit 4af8a7e6085f0518f06180fbf87024a2c9db4c88 diff --git a/regression_tests.json b/regression_tests.json index 853c38dcb..d0a8ed19d 100644 --- a/regression_tests.json +++ b/regression_tests.json @@ -26,4 +26,4 @@ "dependencies": [], "test": "agbenchmark/challenges/retrieval/r3/r3_test.py" } -} \ No newline at end of file +} -- cgit v1.2.3 From 74fc969dd60dd40f6b5ee8806ecc80fea50cb7e2 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Wed, 5 Jul 2023 20:32:28 -0700 Subject: Add basic memory challenge (#57) --- .gitignore | 2 ++ agbenchmark/agent_interface.py | 2 +- agbenchmark/challenge.py | 20 ++++++++++++++++ .../memory/m1/artifacts/instructions_1.txt | 2 ++ .../memory/m1/artifacts/instructions_2.txt | 1 + .../memory/m1/artifacts/instructions_3.txt | 1 + .../memory/m1/artifacts/instructions_4.txt | 1 + .../memory/m1/artifacts/instructions_5.txt | 1 + agbenchmark/challenges/memory/m1/m1_data.json | 21 +++++++++++++++++ agbenchmark/challenges/memory/m1/m1_test.py | 27 ++++++++++++++++++++++ agbenchmark/challenges/memory/m1_test.py | 0 agbenchmark/challenges/memory/memory.py | 8 +++++++ agbenchmark/challenges/retrieval/r1/r1_test.py | 6 ++--- agbenchmark/challenges/retrieval/r2/r2_test.py | 4 +--- agbenchmark/challenges/retrieval/r3/r3_test.py | 4 +--- agbenchmark/conftest.py | 7 +++--- agbenchmark/mocks/mock_manager.py | 6 ++--- agbenchmark/mocks/tests/basic_mocks.py | 11 +++++++++ agbenchmark/start_benchmark.py | 2 -- .../read_file/artifacts/file_to_check.txt | 1 + .../basic_abilities/read_file/r_file_data.json | 4 ++-- .../basic_abilities/read_file/read_file_test.py | 7 ------ .../basic_abilities/write_file/write_file_test.py | 4 +--- regression_tests.json | 17 +++++++++----- 24 files changed, 121 insertions(+), 38 deletions(-) create mode 100644 agbenchmark/challenges/memory/m1/artifacts/instructions_1.txt create mode 100644 agbenchmark/challenges/memory/m1/artifacts/instructions_2.txt create mode 100644 agbenchmark/challenges/memory/m1/artifacts/instructions_3.txt create mode 100644 agbenchmark/challenges/memory/m1/artifacts/instructions_4.txt create mode 100644 agbenchmark/challenges/memory/m1/artifacts/instructions_5.txt create mode 100644 agbenchmark/challenges/memory/m1/m1_data.json create mode 100644 agbenchmark/challenges/memory/m1/m1_test.py delete mode 100644 agbenchmark/challenges/memory/m1_test.py create mode 100644 agbenchmark/challenges/memory/memory.py create mode 100644 agbenchmark/tests/basic_abilities/read_file/artifacts/file_to_check.txt diff --git a/.gitignore b/.gitignore index c41065ca4..3581dc933 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +agbenchmark/mocks/workspace/ + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index 993aa242a..4d74aac73 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -22,7 +22,7 @@ def run_agent( print("No mock provided") elif MOCK_FLAG == "True": mock_manager = MockManager( - task + task, config ) # workspace doesn't need to be passed in, stays the same print("Server unavailable, using mock", mock_func) mock_manager.delegate(mock_func) diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py index eaed73a22..865d64444 100644 --- a/agbenchmark/challenge.py +++ b/agbenchmark/challenge.py @@ -1,5 +1,7 @@ import glob +import inspect import 
os +import shutil from abc import ABC, abstractmethod from typing import Any, Dict, List, Optional @@ -45,6 +47,8 @@ class Challenge(ABC): def setup_challenge(self, config: Dict[str, Any]) -> None: from agbenchmark.agent_interface import run_agent + self.copy_artifacts_into_workspace(config["workspace"]) + run_agent(self.task, self.mock, config) @property @@ -124,3 +128,19 @@ class Challenge(ABC): ) return 1.0 + + def copy_artifacts_into_workspace(self, workspace: str) -> None: + curr_frame = inspect.currentframe() + outer_frame = inspect.getouterframes(curr_frame)[2] + caller_file_path = outer_frame.filename + caller_dir_path = os.path.dirname(os.path.abspath(caller_file_path)) + source_dir = os.path.join(caller_dir_path, "artifacts") + + # Check if source_dir exists, if not then return immediately. + if not os.path.exists(source_dir): + return + + for file_name in os.listdir(source_dir): + full_file_name = os.path.join(source_dir, file_name) + if os.path.isfile(full_file_name): + shutil.copy(full_file_name, workspace) diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_1.txt b/agbenchmark/challenges/memory/m1/artifacts/instructions_1.txt new file mode 100644 index 000000000..da3babb1f --- /dev/null +++ b/agbenchmark/challenges/memory/m1/artifacts/instructions_1.txt @@ -0,0 +1,2 @@ +The id to remember is 2314 +Read the file instructions_2.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_2.txt b/agbenchmark/challenges/memory/m1/artifacts/instructions_2.txt new file mode 100644 index 000000000..87c870b1a --- /dev/null +++ b/agbenchmark/challenges/memory/m1/artifacts/instructions_2.txt @@ -0,0 +1 @@ +Read the file instructions_3.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_3.txt b/agbenchmark/challenges/memory/m1/artifacts/instructions_3.txt new file mode 100644 index 000000000..987086208 --- /dev/null +++ b/agbenchmark/challenges/memory/m1/artifacts/instructions_3.txt @@ -0,0 +1 @@ +Read the file instructions_4.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_4.txt b/agbenchmark/challenges/memory/m1/artifacts/instructions_4.txt new file mode 100644 index 000000000..78da710b8 --- /dev/null +++ b/agbenchmark/challenges/memory/m1/artifacts/instructions_4.txt @@ -0,0 +1 @@ +Read the file instructions_5.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_5.txt b/agbenchmark/challenges/memory/m1/artifacts/instructions_5.txt new file mode 100644 index 000000000..8a2d35509 --- /dev/null +++ b/agbenchmark/challenges/memory/m1/artifacts/instructions_5.txt @@ -0,0 +1 @@ +Write the id previously mentioned to a .txt file. diff --git a/agbenchmark/challenges/memory/m1/m1_data.json b/agbenchmark/challenges/memory/m1/m1_data.json new file mode 100644 index 000000000..7023f85fc --- /dev/null +++ b/agbenchmark/challenges/memory/m1/m1_data.json @@ -0,0 +1,21 @@ +{ + "name": "basic_memory", + "category": ["memory"], + "task": "Follow the instructions in the instructions_1.txt file", + "dependencies": [], + "ground": { + "answer": "2314", + "should_contain": ["2314"], + "should_not_contain": [], + "files": ["file_to_check.txt"] + }, + "mock": { + "mock_func": "basic_memory_mock", + "mock_task": "Follow the instructions in the instructions_1.txt file" + }, + "info": { + "difficulty": "basic", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/memory/m1/m1_test.py b/agbenchmark/challenges/memory/m1/m1_test.py new file mode 100644 index 000000000..28e600cc3 --- /dev/null +++ b/agbenchmark/challenges/memory/m1/m1_test.py @@ -0,0 +1,27 @@ +import os +from typing import Any, Dict + +import pytest + +from agbenchmark.challenges.memory.memory import MemoryChallenge + + +class TestBasicMemory(MemoryChallenge): + """The first memory challenge""" + + def get_file_path(self) -> str: # all tests must implement this method + return os.path.join(os.path.dirname(__file__), "m1_data.json") + + @pytest.mark.depends(name="test_basic_memory") + def test_method(self, config: Dict[str, Any]) -> None: + self.setup_challenge(config) + + files_contents = self.open_files(config["workspace"], self.data.ground.files) + + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, self.data.ground) + print("Your score is:", score) + scores.append(score) + + assert 1 in scores diff --git a/agbenchmark/challenges/memory/m1_test.py b/agbenchmark/challenges/memory/m1_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/challenges/memory/memory.py b/agbenchmark/challenges/memory/memory.py new file mode 100644 index 000000000..429bef23a --- /dev/null +++ b/agbenchmark/challenges/memory/memory.py @@ -0,0 +1,8 @@ +import pytest + +from agbenchmark.challenge import Challenge + + +@pytest.mark.memory +class MemoryChallenge(Challenge): + """Challenge for memory""" diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 767775340..d107d9645 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -1,5 +1,4 @@ import os -from pathlib import Path from typing import Any, Dict import pytest @@ -13,12 +12,11 @@ class TestRetrieval(RetrievalChallenge): def get_file_path(self) -> str: # all tests must implement this method return os.path.join(os.path.dirname(__file__), "r1_data.json") - @pytest.mark.depends(on=["basic_write_file"], name="test_retrieval") + @pytest.mark.depends(name="test_retrieval") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - workspace = Path(os.getcwd()) / config["workspace"] - files_contents = self.open_files(workspace, self.data.ground.files) + files_contents = self.open_files(config["workspace"], self.data.ground.files) scores = [] for file_content in files_contents: diff --git a/agbenchmark/challenges/retrieval/r2/r2_test.py b/agbenchmark/challenges/retrieval/r2/r2_test.py index 7664ca36b..a60296ecd 100644 --- a/agbenchmark/challenges/retrieval/r2/r2_test.py +++ b/agbenchmark/challenges/retrieval/r2/r2_test.py @@ -1,5 +1,4 @@ import os -from pathlib import Path from typing import Any, Dict import pytest @@ -17,8 +16,7 @@ class TestRetrieval2(RetrievalChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - workspace = Path(os.getcwd()) / config["workspace"] - files_contents = self.open_files(workspace, self.data.ground.files) + files_contents = self.open_files(config["workspace"], self.data.ground.files) scores = [] for file_content in files_contents: diff --git a/agbenchmark/challenges/retrieval/r3/r3_test.py b/agbenchmark/challenges/retrieval/r3/r3_test.py index c13de2c86..bcd48d33c 100644 --- 
a/agbenchmark/challenges/retrieval/r3/r3_test.py +++ b/agbenchmark/challenges/retrieval/r3/r3_test.py @@ -1,5 +1,4 @@ import os -from pathlib import Path from typing import Any, Dict import pytest @@ -17,8 +16,7 @@ class TestRetrieval3(RetrievalChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - workspace = Path(os.getcwd()) / config["workspace"] - files_contents = self.open_files(workspace, self.data.ground.files) + files_contents = self.open_files(config["workspace"], self.data.ground.files) scores = [] for file_content in files_contents: diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 66ede2c08..7203ee6bb 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -31,14 +31,13 @@ def config(request: Any) -> None: with open(CONFIG_PATH, "r") as f: config = json.load(f) - if request.config.getoption("--mock"): - config["workspace"] = "agbenchmark/mocks/workspace" - elif config.get("workspace", "").startswith("${") and config.get( + if config.get("workspace", "").startswith("${") and config.get( "workspace", "" ).endswith("}"): path = get_dynamic_workspace(config) config["workspace"] = path - + else: + config["workspace"] = Path(os.getcwd()) / config["workspace"] return config diff --git a/agbenchmark/mocks/mock_manager.py b/agbenchmark/mocks/mock_manager.py index 59fa8dbf1..5b84965c3 100644 --- a/agbenchmark/mocks/mock_manager.py +++ b/agbenchmark/mocks/mock_manager.py @@ -1,13 +1,13 @@ -from typing import Any +from typing import Any, Dict import agbenchmark.mocks.tests.basic_mocks as basic_mocks import agbenchmark.mocks.tests.retrieval_mocks as retrieval_mocks class MockManager: - def __init__(self, task: str): + def __init__(self, task: str, config: Dict[str, Any]) -> None: self.task = task - self.workspace = "agbenchmark/mocks/workspace" + self.workspace = config["workspace"] self.modules = [basic_mocks, retrieval_mocks] def delegate(self, mock_function_name: Any, *args: Any, **kwargs: Any) -> None: diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index 882e3c829..3b9170f4e 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -55,3 +55,14 @@ def basic_retrieval_3_mock(task: str, workspace: str) -> None: "file_to_check.txt", "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", ) + + +def basic_memory_mock(task: str, workspace: str) -> None: + """ + This mock writes to a file (creates one if it doesn't exist) + """ + Challenge.write_to_file( + workspace, + "file_to_check.txt", + "2314", + ) diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index 8ef01d3c5..959dee361 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -56,8 +56,6 @@ def start(category: str, reg: bool, mock: bool) -> int: config = json.load(f) set_key(".env", "MOCK_TEST", "True" if mock else "False") - if mock: - config["workspace"] = "agbenchmark/mocks/workspace" # create workspace directory if it doesn't exist workspace_path = os.path.abspath(config["workspace"]) diff --git a/agbenchmark/tests/basic_abilities/read_file/artifacts/file_to_check.txt b/agbenchmark/tests/basic_abilities/read_file/artifacts/file_to_check.txt new file mode 100644 index 000000000..980a0d5f1 --- /dev/null +++ 
b/agbenchmark/tests/basic_abilities/read_file/artifacts/file_to_check.txt @@ -0,0 +1 @@ +Hello World! diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index b21e2724b..a74b875a8 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -4,8 +4,8 @@ "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", "dependencies": ["basic_write_file"], "ground": { - "answer": "random string: this is how we're doing", - "should_contain": ["random string: this is how we're doing"], + "answer": "random string: Hello World!", + "should_contain": ["random string: Hello World!"], "files": ["file_to_check.txt"] }, "mock": { diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index c5f886d52..e7f2af9ec 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -3,19 +3,12 @@ from typing import Any, Dict import pytest -from agbenchmark.challenge import Challenge from agbenchmark.tests.basic_abilities.basic_challenge import BasicChallenge class TestReadFile(BasicChallenge): """Testing if LLM can read a file""" - @pytest.fixture(scope="module", autouse=True) - def setup_module(self, workspace: str) -> None: - Challenge.write_to_file( - workspace, self.data.ground.files[0], "this is how we're doing" - ) - def get_file_path(self) -> str: # all tests must implement this method return os.path.join(os.path.dirname(__file__), "r_file_data.json") diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 966df7f2d..81f72cc9c 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -1,5 +1,4 @@ import os -from pathlib import Path from typing import Any, Dict import pytest @@ -17,8 +16,7 @@ class TestWriteFile(BasicChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - workspace = Path(os.getcwd()) / config["workspace"] - files_contents = self.open_files(workspace, self.data.ground.files) + files_contents = self.open_files(config["workspace"], self.data.ground.files) scores = [] for file_content in files_contents: diff --git a/regression_tests.json b/regression_tests.json index d0a8ed19d..cfa4bda38 100644 --- a/regression_tests.json +++ b/regression_tests.json @@ -1,4 +1,9 @@ { + "TestBasicMemory": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/memory/m1/m1_test.py" + }, "TestRetrieval": { "difficulty": "basic", "dependencies": [], @@ -9,6 +14,11 @@ "dependencies": [], "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py" }, + "TestRetrieval2": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/retrieval/r2/r2_test.py" + }, "TestReadFile": { "difficulty": "basic", "dependencies": [ @@ -16,14 +26,9 @@ ], "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py" }, - "TestRetrieval2": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/retrieval/r2/r2_test.py" - }, "TestRetrieval3": { "difficulty": "basic", "dependencies": [], "test": 
"agbenchmark/challenges/retrieval/r3/r3_test.py" } -} +} \ No newline at end of file -- cgit v1.2.3 From 7102fe1a182f3caed4f056600e9658d14031fe20 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Wed, 5 Jul 2023 21:03:45 -0700 Subject: Rename '--reg' flag to '--maintain' (#58) --- .github/workflows/ci.yml | 1 + agbenchmark/start_benchmark.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6fb348e9b..6a0f4503a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -106,3 +106,4 @@ jobs: run: | poetry install poetry run agbenchmark start --mock + poetry run agbenchmark start --mock --maintain diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index 959dee361..9c7b8e8da 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -25,9 +25,9 @@ def cli() -> None: @cli.command() @click.option("--category", default=None, help="Specific category to run") -@click.option("--reg", is_flag=True, help="Runs only regression tests") +@click.option("--maintain", is_flag=True, help="Runs only regression tests") @click.option("--mock", is_flag=True, help="Run with mock") -def start(category: str, reg: bool, mock: bool) -> int: +def start(category: str, maintain: bool, mock: bool) -> int: """Start the benchmark tests. If a category flag is provided, run the categories with that mark.""" # Check if configuration file exists and is not empty if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0: @@ -76,7 +76,7 @@ def start(category: str, reg: bool, mock: bool) -> int: if category: pytest_args.extend(["-m", category]) else: - if reg: + if maintain: print("Running all regression tests") tests_to_run = get_regression_tests() else: -- cgit v1.2.3 From e6f7bcf0ae6d115a8f0a7c35036792ac212ba9f9 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Thu, 6 Jul 2023 00:06:34 -0400 Subject: fixing --reg to --maintain workflow bug --- .github/workflows/autogpt.yml | 9 +++-- .github/workflows/gpt-engineer.yml | 8 ++--- .github/workflows/mini-agi.yml | 2 +- .github/workflows/smol-developer.yml | 8 ++--- .github/workflows/superagi.yml | 64 ++++++++++++++++++++++++++++++++++++ agent/SuperAGI | 1 + 6 files changed, 78 insertions(+), 14 deletions(-) create mode 100644 .github/workflows/superagi.yml create mode 160000 agent/SuperAGI diff --git a/.github/workflows/autogpt.yml b/.github/workflows/autogpt.yml index 4316f36ff..2d7e2dfbd 100644 --- a/.github/workflows/autogpt.yml +++ b/.github/workflows/autogpt.yml @@ -2,9 +2,9 @@ name: Auto-GPT Regression Test on: workflow_dispatch: - branches: [ master ] + branches: [master] push: - branches: [ stable, master, ci-test* ] + branches: [stable, master, ci-test*] jobs: regression-tests: @@ -15,7 +15,7 @@ jobs: timeout-minutes: 30 strategy: matrix: - python-version: ["3.10"] + python-version: ['3.10'] steps: - name: Checkout repository @@ -51,7 +51,6 @@ jobs: run: | poetry install --only main poetry build - - name: Run regression tests run: | @@ -60,6 +59,6 @@ jobs: cd agent/Auto-GPT pip install -r requirements.txt pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl - agbenchmark start --reg + agbenchmark start --maintain env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} diff --git a/.github/workflows/gpt-engineer.yml b/.github/workflows/gpt-engineer.yml index e0dbac2f0..a39165482 100644 --- a/.github/workflows/gpt-engineer.yml +++ b/.github/workflows/gpt-engineer.yml @@ -2,9 +2,9 @@ name: gpt-engineer Regression 
Test on: workflow_dispatch: - branches: [ master ] + branches: [master] push: - branches: [ stable, master, ci-test* ] + branches: [stable, master, ci-test*] jobs: regression-tests: @@ -15,7 +15,7 @@ jobs: timeout-minutes: 30 strategy: matrix: - python-version: ["3.10"] + python-version: ['3.10'] steps: - name: Checkout repository @@ -58,7 +58,7 @@ jobs: make install source venv/bin/activate pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl - agbenchmark start --reg + agbenchmark start --maintain env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} diff --git a/.github/workflows/mini-agi.yml b/.github/workflows/mini-agi.yml index 055df304d..53c479df4 100644 --- a/.github/workflows/mini-agi.yml +++ b/.github/workflows/mini-agi.yml @@ -60,7 +60,7 @@ jobs: pip install -r requirements.txt cp .env_example .env pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl - agbenchmark start --reg + agbenchmark start --maintain env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} PROMPT_USER: false diff --git a/.github/workflows/smol-developer.yml b/.github/workflows/smol-developer.yml index 13ee8cf8d..6926df54b 100644 --- a/.github/workflows/smol-developer.yml +++ b/.github/workflows/smol-developer.yml @@ -2,9 +2,9 @@ name: smol developer Regression Test on: workflow_dispatch: - branches: [ master ] + branches: [master] push: - branches: [ stable, master, ci-test* ] + branches: [stable, master, ci-test*] jobs: regression-tests: @@ -15,7 +15,7 @@ jobs: timeout-minutes: 30 strategy: matrix: - python-version: ["3.10"] + python-version: ['3.10'] steps: - name: Checkout repository @@ -59,6 +59,6 @@ jobs: source venv/bin/activate pip install -r requirements.txt pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl - agbenchmark start --reg + agbenchmark start --maintain env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} diff --git a/.github/workflows/superagi.yml b/.github/workflows/superagi.yml new file mode 100644 index 000000000..88176cdc9 --- /dev/null +++ b/.github/workflows/superagi.yml @@ -0,0 +1,64 @@ +name: SuperAgi Regression Test + +on: + workflow_dispatch: + branches: [master] + push: + branches: [stable, master, ci-test*] + +jobs: + regression-tests: + permissions: + pull-requests: write + contents: write + runs-on: ubuntu-latest + timeout-minutes: 30 + strategy: + matrix: + python-version: ['3.10'] + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 0 + ref: ${{ github.event.pull_request.head.ref }} + repository: ${{ github.event.pull_request.head.repo.full_name }} + submodules: true + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - id: get_date + name: Get date + run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT + + - name: Install Poetry + run: | + curl -sSL https://install.python-poetry.org | python - + + - name: Set up Poetry cache + uses: actions/cache@v2 + with: + path: | + ~/.cache/pypoetry + .venv + key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} + + - name: Set up venv and install Python dependencies + run: | + poetry install --only main + poetry build + + - name: Run regression tests + run: | + cd agent/SuperAgi + cp config_template.yaml config.yaml + sed -i 's/OPENAI_API_KEY:.*/OPENAI_API_KEY: "'"${{ secrets.OPENAI_API_KEY }}"'"/' config.yaml + python -m venv venv + source venv/bin/activate + pip install -r requirements.txt + pip install 
../../dist/agbenchmark-0.1.0-py3-none-any.whl + agbenchmark start --maintain diff --git a/agent/SuperAGI b/agent/SuperAGI new file mode 160000 index 000000000..166843799 --- /dev/null +++ b/agent/SuperAGI @@ -0,0 +1 @@ +Subproject commit 16684379930c770d3eb0ea00bd9f8d2630a1aa99 -- cgit v1.2.3 From de44d6ace51a229eff60d6d1965cdd18040e7d4d Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Thu, 6 Jul 2023 00:08:49 -0400 Subject: fix --- .github/workflows/superagi.yml | 64 ------------------------------------------ 1 file changed, 64 deletions(-) delete mode 100644 .github/workflows/superagi.yml diff --git a/.github/workflows/superagi.yml b/.github/workflows/superagi.yml deleted file mode 100644 index 88176cdc9..000000000 --- a/.github/workflows/superagi.yml +++ /dev/null @@ -1,64 +0,0 @@ -name: SuperAgi Regression Test - -on: - workflow_dispatch: - branches: [master] - push: - branches: [stable, master, ci-test*] - -jobs: - regression-tests: - permissions: - pull-requests: write - contents: write - runs-on: ubuntu-latest - timeout-minutes: 30 - strategy: - matrix: - python-version: ['3.10'] - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 0 - ref: ${{ github.event.pull_request.head.ref }} - repository: ${{ github.event.pull_request.head.repo.full_name }} - submodules: true - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - id: get_date - name: Get date - run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT - - - name: Install Poetry - run: | - curl -sSL https://install.python-poetry.org | python - - - - name: Set up Poetry cache - uses: actions/cache@v2 - with: - path: | - ~/.cache/pypoetry - .venv - key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} - - - name: Set up venv and install Python dependencies - run: | - poetry install --only main - poetry build - - - name: Run regression tests - run: | - cd agent/SuperAgi - cp config_template.yaml config.yaml - sed -i 's/OPENAI_API_KEY:.*/OPENAI_API_KEY: "'"${{ secrets.OPENAI_API_KEY }}"'"/' config.yaml - python -m venv venv - source venv/bin/activate - pip install -r requirements.txt - pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl - agbenchmark start --maintain -- cgit v1.2.3 From 4ebc5aa3b3b08bfa5710c9f2f8b28d737889c259 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Thu, 6 Jul 2023 00:14:40 -0400 Subject: submodule remove --- agent/SuperAGI | 1 - 1 file changed, 1 deletion(-) delete mode 160000 agent/SuperAGI diff --git a/agent/SuperAGI b/agent/SuperAGI deleted file mode 160000 index 166843799..000000000 --- a/agent/SuperAGI +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 16684379930c770d3eb0ea00bd9f8d2630a1aa99 -- cgit v1.2.3 From 5b19340f8e4cad6537d98b9a4d46e3635c762c1c Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Wed, 5 Jul 2023 21:35:15 -0700 Subject: Add 'Remember multiple ids' memory challenge (#59) --- .../memory/m2/artifacts/instructions_1.txt | 1 + .../memory/m2/artifacts/instructions_2.txt | 1 + .../memory/m2/artifacts/instructions_3.txt | 1 + .../memory/m2/artifacts/instructions_4.txt | 1 + .../memory/m2/artifacts/instructions_5.txt | 1 + .../memory/m2/remember_multiple_ids_data.json | 21 +++++++++++++++ .../memory/m2/remember_multiple_ids_test.py | 31 ++++++++++++++++++++++ agbenchmark/mocks/tests/basic_mocks.py | 11 ++++++++ regression_tests.json | 5 ++++ 9 files changed, 73 insertions(+) 
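The memory challenges in this commit chain instruction files together and expect the agent to carry the ids across reads. As a toy illustration only — the file names and the id phrasing are taken from the artifacts below, everything else is assumed — the chain can be walked like this:

```python
# Toy walker for the memory-challenge instruction chain. It collects every
# "id to remember" and follows each "Read the file ..." pointer in turn.
import re
from pathlib import Path
from typing import List, Optional


def follow_instructions(workspace: Path) -> List[str]:
    ids: List[str] = []
    current: Optional[str] = "instructions_1.txt"
    while current is not None:
        text = (workspace / current).read_text()
        ids.extend(re.findall(r"id to remember is (\d+)", text))
        match = re.search(r"Read the file (instructions_\d+\.txt)", text)
        current = match.group(1) if match else None
    return ids
```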
create mode 100644 agbenchmark/challenges/memory/m2/artifacts/instructions_1.txt create mode 100644 agbenchmark/challenges/memory/m2/artifacts/instructions_2.txt create mode 100644 agbenchmark/challenges/memory/m2/artifacts/instructions_3.txt create mode 100644 agbenchmark/challenges/memory/m2/artifacts/instructions_4.txt create mode 100644 agbenchmark/challenges/memory/m2/artifacts/instructions_5.txt create mode 100644 agbenchmark/challenges/memory/m2/remember_multiple_ids_data.json create mode 100644 agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py diff --git a/agbenchmark/challenges/memory/m2/artifacts/instructions_1.txt b/agbenchmark/challenges/memory/m2/artifacts/instructions_1.txt new file mode 100644 index 000000000..d304ce632 --- /dev/null +++ b/agbenchmark/challenges/memory/m2/artifacts/instructions_1.txt @@ -0,0 +1 @@ +The id to remember is 3145. Read the file instructions_2.txt. diff --git a/agbenchmark/challenges/memory/m2/artifacts/instructions_2.txt b/agbenchmark/challenges/memory/m2/artifacts/instructions_2.txt new file mode 100644 index 000000000..cfff66ba2 --- /dev/null +++ b/agbenchmark/challenges/memory/m2/artifacts/instructions_2.txt @@ -0,0 +1 @@ +The id to remember is 3791. Read the file instructions_3.txt. diff --git a/agbenchmark/challenges/memory/m2/artifacts/instructions_3.txt b/agbenchmark/challenges/memory/m2/artifacts/instructions_3.txt new file mode 100644 index 000000000..ad16b6fdc --- /dev/null +++ b/agbenchmark/challenges/memory/m2/artifacts/instructions_3.txt @@ -0,0 +1 @@ +The id to remember is 9317. Read the file instructions_4.txt. diff --git a/agbenchmark/challenges/memory/m2/artifacts/instructions_4.txt b/agbenchmark/challenges/memory/m2/artifacts/instructions_4.txt new file mode 100644 index 000000000..2394d64bc --- /dev/null +++ b/agbenchmark/challenges/memory/m2/artifacts/instructions_4.txt @@ -0,0 +1 @@ +The id to remember is 9471. Read the file instructions_5.txt. diff --git a/agbenchmark/challenges/memory/m2/artifacts/instructions_5.txt b/agbenchmark/challenges/memory/m2/artifacts/instructions_5.txt new file mode 100644 index 000000000..92a6fba94 --- /dev/null +++ b/agbenchmark/challenges/memory/m2/artifacts/instructions_5.txt @@ -0,0 +1 @@ +Write the ids previously mentioned to a file called file_to_check.txt. diff --git a/agbenchmark/challenges/memory/m2/remember_multiple_ids_data.json b/agbenchmark/challenges/memory/m2/remember_multiple_ids_data.json new file mode 100644 index 000000000..374df6165 --- /dev/null +++ b/agbenchmark/challenges/memory/m2/remember_multiple_ids_data.json @@ -0,0 +1,21 @@ +{ + "name": "remember_multiple_ids", + "category": ["memory"], + "task": "Follow the instructions in the instructions_1.txt file", + "dependencies": [], + "ground": { + "answer": "3145\n3791\n9317\n9471", + "should_contain": ["3145", "3791", "9317", "9471"], + "should_not_contain": [], + "files": ["file_to_check.txt"] + }, + "mock": { + "mock_func": "remember_multiple_ids_mock", + "mock_task": "Follow the instructions in the instructions_1.txt file" + }, + "info": { + "difficulty": "basic", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py b/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py new file mode 100644 index 000000000..d5f0cf1a9 --- /dev/null +++ b/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py @@ -0,0 +1,31 @@ +import os +from typing import Any, Dict + +import pytest + +from agbenchmark.challenges.memory.memory import MemoryChallenge + + +class TestRememberMultipleIds(MemoryChallenge): + """The first memory challenge""" + + def get_file_path(self) -> str: # all tests must implement this method + return os.path.join( + os.path.dirname(__file__), "remember_multiple_ids_data.json" + ) + + @pytest.mark.depends( + name="test_remember_multiple_ids", depends=["test_basic_memory"] + ) + def test_method(self, config: Dict[str, Any]) -> None: + self.setup_challenge(config) + + files_contents = self.open_files(config["workspace"], self.data.ground.files) + + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, self.data.ground) + print("Your score is:", score) + scores.append(score) + + assert 1 in scores diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index 3b9170f4e..1ffb3de39 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -66,3 +66,14 @@ def basic_memory_mock(task: str, workspace: str) -> None: "file_to_check.txt", "2314", ) + + +def remember_multiple_ids_mock(task: str, workspace: str) -> None: + """ + This mock writes to a file (creates one if it doesn't exist) + """ + Challenge.write_to_file( + workspace, + "file_to_check.txt", + "3145\n3791\n9317\n9471", + ) diff --git a/regression_tests.json b/regression_tests.json index cfa4bda38..9742aa47e 100644 --- a/regression_tests.json +++ b/regression_tests.json @@ -30,5 +30,10 @@ "difficulty": "basic", "dependencies": [], "test": "agbenchmark/challenges/retrieval/r3/r3_test.py" + }, + "TestRememberMultipleIds": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py" } } \ No newline at end of file -- cgit v1.2.3 From c76062b0924543e70feb0d6b621cf642c987df51 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Thu, 6 Jul 2023 00:38:01 -0400 Subject: Added caching based on file key (#62) Co-authored-by: merwanehamadi --- agbenchmark/challenge.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py index 865d64444..dee2b435e 100644 --- a/agbenchmark/challenge.py +++ b/agbenchmark/challenge.py @@ -20,6 +20,8 @@ class Challenge(ABC): """The parent class to all specific challenges classes. 
Defines helper methods for running a challenge""" + _data_cache: Dict[str, ChallengeData] = {} + @abstractmethod def get_file_path(self) -> str: """This should be implemented by any class which inherits from BasicChallenge""" @@ -27,8 +29,13 @@ class Challenge(ABC): @property def data(self) -> ChallengeData: - # TODO: make it so that this is cached somewhere to just call self.deserialized_data - return ChallengeData.deserialize(self.get_file_path()) + "Check if the data is already loaded, if not load it" + file_path = ( + self.get_file_path() + ) # file_path serves as the key in the cache dictionary + if file_path not in Challenge._data_cache: + Challenge._data_cache[file_path] = ChallengeData.deserialize(file_path) + return Challenge._data_cache[file_path] @property def mock(self) -> Optional[str]: -- cgit v1.2.3 From 82d8f67f6ab62989469205230ac7a6668e3c7407 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Wed, 5 Jul 2023 22:34:51 -0700 Subject: Add 'remember ids with noise' challenge (#61) --- .../memory/m3/artifacts/instructions_1.txt | 5 ++++ .../memory/m3/artifacts/instructions_2.txt | 5 ++++ .../memory/m3/artifacts/instructions_3.txt | 5 ++++ .../memory/m3/artifacts/instructions_4.txt | 5 ++++ .../memory/m3/artifacts/instructions_5.txt | 5 ++++ .../m3/remember_multiple_ids_with_noise_data.json | 21 ++++++++++++++ .../m3/remember_multiple_ids_with_noise_test.py | 32 ++++++++++++++++++++++ regression_tests.json | 25 ++++++++++------- 8 files changed, 93 insertions(+), 10 deletions(-) create mode 100644 agbenchmark/challenges/memory/m3/artifacts/instructions_1.txt create mode 100644 agbenchmark/challenges/memory/m3/artifacts/instructions_2.txt create mode 100644 agbenchmark/challenges/memory/m3/artifacts/instructions_3.txt create mode 100644 agbenchmark/challenges/memory/m3/artifacts/instructions_4.txt create mode 100644 agbenchmark/challenges/memory/m3/artifacts/instructions_5.txt create mode 100644 agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_data.json create mode 100644 agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py diff --git a/agbenchmark/challenges/memory/m3/artifacts/instructions_1.txt b/agbenchmark/challenges/memory/m3/artifacts/instructions_1.txt new file mode 100644 index 000000000..8deee1b09 --- /dev/null +++ b/agbenchmark/challenges/memory/m3/artifacts/instructions_1.txt @@ -0,0 +1,5 @@ +xBd9cZTfaExYalwtUVR7m0pe3Nfaf5uBS4IxGFZPZcQjDf8Tfk2vdpqWI0ESBWCdVzsdlxMmUxq43INSz1iftsv6PTOMGQ88Cojwj5mQXp8XKZ6VJC893BDLVLLW00iQy8VerWjQay9rAJz8rYVZHa6dByYNWZNOgtjC7ejnKt0VYZtUFfRBZNWG2HNX3tgX8H2h4xeu7GIfm4wETGvzlZDANeqiY2hMCXEAsliVXXjgmvVeB05tjkS5uvq5uV2DnNyStimIVVdEMFI5Ft0qM82aMvJlUtVj6TJEmE0qPTqBXeHud72iRTcBa9okCzjYiAd6oSoJ8k9o6lmFTeq323ILYCGzsICjqoysuVonfHUDh1Ll2LTo4I2AygfPqCqvgQWq9wa8YfWKBlwPPVy2lymJRTd1mS7RUaiagoNn76ApJviCYh2fWEZcxULCKAbbn0E6vz1CBADSOEIVB14ZyyRfiDcXbgYYcnOShwMsg0vYcKDKfAHk + +The id to remember is 3145. Read the file instructions_2.txt. 
+ +OueiZyOoM5eGI5VkTt6Ue1XboZ4jztAa5IGWqSbhIhLiI4X2nOmJw6tLBd3smZNwKQpq8NHxZSk76Xd82yGI3l9KhLARXRT37MRkY7WG8YQVJEurki86cIGuXthSWjq9dDKh6ILSq4DdWPIHfDeV12psErCcgEUxWtoU6bnSMnaoYteOkKWTAkXdC1t4j5p3rXbMv1j92nLmCmoslT2A9noQIODWLdudtCecTMmrbq85RLBt5WFLGMfWVsuSrSMGo5tiN7vC1siLfhlhco0q5QaMv0px6kVg44Wceg3UXIUoMxTNoh9G8uEABJhvsF2qzxkbHuhk6VRuydIWoGgfN01upk6BDfvooyAkdcTJG5jFlHOJixTe4ramT5uP54oZ0anJTB6w7hybN3o9vb4xLbAFQxCZIXZ9HXgeBnl1L8qIvQg9VsklntCMsu2cm5CgIryRBGPqnTAbrhmAmFOkNyLSVFfYmu2wtlMov2NIkYilT4Oa1Rkt diff --git a/agbenchmark/challenges/memory/m3/artifacts/instructions_2.txt b/agbenchmark/challenges/memory/m3/artifacts/instructions_2.txt new file mode 100644 index 000000000..c9b5fc5f9 --- /dev/null +++ b/agbenchmark/challenges/memory/m3/artifacts/instructions_2.txt @@ -0,0 +1,5 @@ +2yXfw2vPZCZq4jGOTHF4NEUYLbAUBIcmkgLxG7qXnYLNsvvZDqAvBPw4OcOfleIWvS6S5GThSPcrSfX94yB1TT6SVHGqPkulJUk4W1wfIFRIiOSps6V8ulLyrmeZsEJ6l9B9Vrm4h6SZTQVP750TUfECOH4d5j5AtupugjqThyw3t6ZFYHr2eUYRiOiTlng2uvsoZiioBQlUitrjQ4mw8FRL3VaR2aAhHGwaNV0Q7XelFU50YQgcUYqfxHxmqCLqb7dtZ7WWwxrLcqwVbj4y1YteFzPZyU4TJKopMVizgWaam8tKa1hYAQHqEaiAAHigqvYhutPHarpzc4PP2RLE4AZCxRblSY40iXpxQ9waXsrUEZ51ZRFmvm5G17wuKghMcKea2jN2MIgvSxNBy7cszFyBTe6V6u6IMk1wVWa0YulPslLc0bXUVKqZ54b61lyBAKSoFbJVRFYB5XZBL5tp2efvTsEQ3JvFmSREEOhmawIriifCApy1 + +The id to remember is 3791. Read the file instructions_3.txt. + +BDLfeJBcfS4iqE9sNAm4ndZT2F1fsFYdXGRpRQ6xSXl014c9H7NeMbQCtFb7kRtVvzx9AItPj1uqtjA0R35N2Pj8FgxfSPDb8dlizLA6dbKY4JfCWmibzrBYoFzoxiPX57m3n8yLKHA0aejG38aMJ6XjR75kAjBW0Cw9d3Ny0MphakfW8KDZoMO3qwsPLLASYrz42K7JjThVGZvEXczRBY8la4NJPZpj91GmfsQuJezCvcI87gjfjtRDp1GECU9SmLSWBufjQWWlc4p6z5XtPPu0vqxRjoiFDFZvafU35KkEDcWAHv3KhR0Z20JD2qIrJ4CHntwGBAk61nMBpKhNx0t3ONK5X0WD7gNCdG64obji2ifsI8ZydLkROJkAJCpe4zRd04mkydCwKGJzmCGv0lu1KRn4QobFq7mEeuzD0xvvGtyiuiVXJSVqphf5ySmfjD4EvDCMRDNZx7c4pECUnLBPDlB8JwMyugfyD5mslte9YCG9kK6n diff --git a/agbenchmark/challenges/memory/m3/artifacts/instructions_3.txt b/agbenchmark/challenges/memory/m3/artifacts/instructions_3.txt new file mode 100644 index 000000000..a564a7c91 --- /dev/null +++ b/agbenchmark/challenges/memory/m3/artifacts/instructions_3.txt @@ -0,0 +1,5 @@ +ccgfgLxgQxZ1F3USqH9STHNVYAV3AnxU97C77kgsSkx51Czq0cLp4ZmedPY3dVNUtm2kR9HDahe7Cp6zc3zvpK9RlRKQ1rugGDkNbgc4H57MYOQpDKzbikvHWXtxFixXqNiuRJW8h8uAAKv7qsStgJ2loyM9zKVicCdHerR2Q3kT0No2DOxHUv8T1d7rN3Kf0Sh6nz2LOSqvLjtzTAxpkM7ehUzFuOoXjQnuVUHqXTzL26ASeMd74N5DgO81LofDb6mcvrmVMIY8tfDBqHV83a8zn9HySeg5ZapnAVI1KOXRXyZEgbsVe1rEy8slCXqEuYCqn8IudJBnRDNKnZRdhwQ8tCpXQ1D3quhLEJxjhpRFiLl7lEZvtkunL2UlDkErcFjfKfVJLNy2loQsB9aKbR3NXBGbQNXR9QQ2xyGJPl8Wnw03sCRjubn83LpMoC2jirhJld5VFTffPY3jw0MyX3vUF8kfHRGowsrzzcdNuESiRf3iMcmx + +The id to remember is 9317. Read the file instructions_4.txt. 
+ +8YwXVp6zAoxCfth85iCMA2BrfDW8JjjzieFnuj7GOhW8BlbTsymfiA59qTBCuThFk0wKlT4MDskG5dU7dMHuEAKslHwcfZywvUZYFacVgSn3VfQeWBUgXE7rrrYRclqpjI3nqYuqfVzNHxn0iG13j6q5SAauFSaP2x0CWdDnEGqi9Dh7DcvjVIzlXn3GLa1BTd0qceGYCJRT7UM9YTHJYHo7Ipr2Hgreawb09Xx0KovgzBSgJYpqRNwVRFTfpgvZZIEA0QDPZE28n8KgtDPHHAIroZP6KijKCjyNLjYyM0D4AtiuLDon8bcaLRhzhC6plMvI7pkpEpsct3u9zg8C8V0pONWpWNNCompsuwl4TECiVtVygrYNcroROjNs1CGfliHCvGBbADSQORrFbhRyYEhIVNgXQsh4ilpcOYLAZoxsYlFtJw5Mf7y8ebhay1uJfNaolFWD63QID5UHBqzXZUDupjtUhD0N4XEtNeVyTqXD4S5SffdQ diff --git a/agbenchmark/challenges/memory/m3/artifacts/instructions_4.txt b/agbenchmark/challenges/memory/m3/artifacts/instructions_4.txt new file mode 100644 index 000000000..63fcd54fa --- /dev/null +++ b/agbenchmark/challenges/memory/m3/artifacts/instructions_4.txt @@ -0,0 +1,5 @@ +RpsPog8gFZhpbMDHImq5OBbzE9DwrzKaqldc7UVs8FXohjoAaLb7sYW291NVDdUUk4AYwHum0VotgPPdQp3wxGza9Se6pex9G6IDd1jxPW5UTnik0Ygns0U5m5uCvuF140ozvPBj5blJ5wWIiQOXceyl7RsXIP82w1u0yTMCmAdefJISC9HAsJKg2QBbtCNoGIHEIuCZFSdskBe9XaPxPoA40AD4UXpwKIE3qEui39QyePL1czeqSGGmmRgziDiMhFiSwNarA0rwehZPW3QKpACjRu9wcVoTg7wxT4R8bziUzwVDeI5ZeqNqNrTUYiLpfUEbckcRQD3FUALmEtzIdSMFdD9z8CiSXVEAOpUbiqm9itNBnMVaZQ2eyKxet4EvZrfUbkhGntIxf8VOsgXlkzb0iXcyHu1PogCjOkSWodlPfN4mYm0P4EvUGogI3suX3tCXp5EJiZHSIoyApxme6Nd6NkdbIjcQOep42z6z23cO5uPv41cA + +The id to remember is 9471. Read the file instructions_5.txt. + +1d2q36DxTTRcg8ijubOcPntReHzsM94G2O0xmclXuqh8kqVaYcNdl7mmQWNSSh4OEF6BUuhVg0s58UFeJnLNOzYeC8T0fYhhrvkw8enra8JS2VeQCnTxdn4GUhJXu4V0UOhaeNnArmO03M8n6mQbV77VnH5RSVR4qVWnYmHGaswdHxYOZVnxFSyzzmhdX1dK5q9EgIwadglyBkpKu15ZaMIH2SOPjil8YjD4euRZc2EGZDcvCkJAJeLsrbTQLM82k2fKqRFyuNELCQp1DRz6gExByQ11SbHRqednp6ndj8msLC116PVMqp8pSDVZ1RrTFV9UOReeO9wnveJbLUEMh8c3wlg6yXn0XrbO7qabvdKXAjIIF8JN9HdLoFGdtDPLJbHDyJPqAYt9OIfyr8knQFLJZjKCBn8QrYy1jdFIHnENf1vZTI1bA4SUOG81xeRHGDVln9mcjdlBHdWRHX9TPbaokSjl9q7nDk6734KpPullyvm10l1M diff --git a/agbenchmark/challenges/memory/m3/artifacts/instructions_5.txt b/agbenchmark/challenges/memory/m3/artifacts/instructions_5.txt new file mode 100644 index 000000000..1c3f156e3 --- /dev/null +++ b/agbenchmark/challenges/memory/m3/artifacts/instructions_5.txt @@ -0,0 +1,5 @@ +sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232 + +Write the ids previously mentioned to a file called file_to_check.txt. 
+
+gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0
diff --git a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_data.json b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_data.json
new file mode 100644
index 000000000..95c93ccb9
--- /dev/null
+++ b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_data.json
@@ -0,0 +1,21 @@
+{
+  "name": "remember_multiple_ids_with_noise_mock",
+  "category": ["memory"],
+  "task": "Follow the instructions in the instructions_1.txt file",
+  "dependencies": [],
+  "ground": {
+    "answer": "3145\n3791\n9317\n9471",
+    "should_contain": ["3145", "3791", "9317", "9471"],
+    "should_not_contain": [],
+    "files": ["file_to_check.txt"]
+  },
+  "mock": {
+    "mock_func": "remember_multiple_ids_mock",
+    "mock_task": "Follow the instructions in the instructions_1.txt file"
+  },
+  "info": {
+    "difficulty": "medium",
+    "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
+    "side_effects": ["tests if there is in fact an LLM attached"]
+  }
+}
diff --git a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py
new file mode 100644
index 000000000..4d2d64957
--- /dev/null
+++ b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py
@@ -0,0 +1,32 @@
+import os
+from typing import Any, Dict
+
+import pytest
+
+from agbenchmark.challenges.memory.memory import MemoryChallenge
+
+
+class TestRememberMultipleIdsWithNoise(MemoryChallenge):
+    """The third memory challenge"""
+
+    def get_file_path(self) -> str:  # all tests must implement this method
+        return os.path.join(
+            os.path.dirname(__file__), "remember_multiple_ids_with_noise_data.json"
+        )
+
+    @pytest.mark.depends(
+        name="test_remember_multiple_ids_with_noise",
+        depends=["test_remember_multiple_ids"],
+    )
+    def test_method(self, config: Dict[str, Any]) -> None:
+        self.setup_challenge(config)
+
+        files_contents = self.open_files(config["workspace"], self.data.ground.files)
+
+        scores = []
+        for file_content in files_contents:
+            score = self.scoring(file_content, self.data.ground)
+            print("Your score is:", score)
+            scores.append(score)
+
+        assert 1 in scores
diff --git a/regression_tests.json b/regression_tests.json
index 9742aa47e..3b91a5c90 100644
--- a/regression_tests.json
+++ b/regression_tests.json
@@ -4,11 +4,6 @@
     "dependencies": [],
     "test": "agbenchmark/challenges/memory/m1/m1_test.py"
   },
-  "TestRetrieval": {
-    "difficulty": "basic",
-    "dependencies": [],
-    "test": "agbenchmark/challenges/retrieval/r1/r1_test.py"
-  },
   "TestWriteFile": {
     "difficulty": "basic",
     "dependencies": [],
     "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py"
   },
@@ -19,6 +14,16 @@
     "dependencies": [],
     "test": "agbenchmark/challenges/retrieval/r2/r2_test.py"
   },
+  "TestRetrieval3": {
+    "difficulty": "basic",
+    "dependencies": [],
+    "test": 
"agbenchmark/challenges/retrieval/r3/r3_test.py" + }, + "TestRetrieval": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/retrieval/r1/r1_test.py" + }, "TestReadFile": { "difficulty": "basic", "dependencies": [ @@ -26,14 +31,14 @@ ], "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py" }, - "TestRetrieval3": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/retrieval/r3/r3_test.py" - }, "TestRememberMultipleIds": { "difficulty": "basic", "dependencies": [], "test": "agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py" + }, + "TestRememberMultipleIdsWithNoise": { + "difficulty": "medium", + "dependencies": [], + "test": "agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py" } } \ No newline at end of file -- cgit v1.2.3 From 0b4ae5ea78cc10506cfea863ff0cd9bea4f3575e Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Thu, 6 Jul 2023 14:19:12 -0700 Subject: Add 'remember phrases with noise' challenge (#63) --- .../memory/m4/artifacts/instructions_1.txt | 5 ++++ .../memory/m4/artifacts/instructions_2.txt | 5 ++++ .../memory/m4/artifacts/instructions_3.txt | 5 ++++ .../memory/m4/artifacts/instructions_4.txt | 5 ++++ .../memory/m4/artifacts/instructions_5.txt | 5 ++++ .../remember_multiple_phrases_with_noise_data.json | 26 ++++++++++++++++++ .../remember_multiple_phrases_with_noise_test.py | 32 ++++++++++++++++++++++ agbenchmark/mocks/tests/basic_mocks.py | 11 ++++++++ regression_tests.json | 5 ++++ 9 files changed, 99 insertions(+) create mode 100644 agbenchmark/challenges/memory/m4/artifacts/instructions_1.txt create mode 100644 agbenchmark/challenges/memory/m4/artifacts/instructions_2.txt create mode 100644 agbenchmark/challenges/memory/m4/artifacts/instructions_3.txt create mode 100644 agbenchmark/challenges/memory/m4/artifacts/instructions_4.txt create mode 100644 agbenchmark/challenges/memory/m4/artifacts/instructions_5.txt create mode 100644 agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json create mode 100644 agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_1.txt b/agbenchmark/challenges/memory/m4/artifacts/instructions_1.txt new file mode 100644 index 000000000..1b1e0147c --- /dev/null +++ b/agbenchmark/challenges/memory/m4/artifacts/instructions_1.txt @@ -0,0 +1,5 @@ +xBd9cZTfaExYalwtUVR7m0pe3Nfaf5uBS4IxGFZPZcQjDf8Tfk2vdpqWI0ESBWCdVzsdlxMmUxq43INSz1iftsv6PTOMGQ88Cojwj5mQXp8XKZ6VJC893BDLVLLW00iQy8VerWjQay9rAJz8rYVZHa6dByYNWZNOgtjC7ejnKt0VYZtUFfRBZNWG2HNX3tgX8H2h4xeu7GIfm4wETGvzlZDANeqiY2hMCXEAsliVXXjgmvVeB05tjkS5uvq5uV2DnNyStimIVVdEMFI5Ft0qM82aMvJlUtVj6TJEmE0qPTqBXeHud72iRTcBa9okCzjYiAd6oSoJ8k9o6lmFTeq323ILYCGzsICjqoysuVonfHUDh1Ll2LTo4I2AygfPqCqvgQWq9wa8YfWKBlwPPVy2lymJRTd1mS7RUaiagoNn76ApJviCYh2fWEZcxULCKAbbn0E6vz1CBADSOEIVB14ZyyRfiDcXbgYYcnOShwMsg0vYcKDKfAHk + +The phrase to remember is 'The purple elephant danced on a rainbow while eating a taco'. Read the file instructions_2.txt. 
+ +OueiZyOoM5eGI5VkTt6Ue1XboZ4jztAa5IGWqSbhIhLiI4X2nOmJw6tLBd3smZNwKQpq8NHxZSk76Xd82yGI3l9KhLARXRT37MRkY7WG8YQVJEurki86cIGuXthSWjq9dDKh6ILSq4DdWPIHfDeV12psErCcgEUxWtoU6bnSMnaoYteOkKWTAkXdC1t4j5p3rXbMv1j92nLmCmoslT2A9noQIODWLdudtCecTMmrbq85RLBt5WFLGMfWVsuSrSMGo5tiN7vC1siLfhlhco0q5QaMv0px6kVg44Wceg3UXIUoMxTNoh9G8uEABJhvsF2qzxkbHuhk6VRuydIWoGgfN01upk6BDfvooyAkdcTJG5jFlHOJixTe4ramT5uP54oZ0anJTB6w7hybN3o9vb4xLbAFQxCZIXZ9HXgeBnl1L8qIvQg9VsklntCMsu2cm5CgIryRBGPqnTAbrhmAmFOkNyLSVFfYmu2wtlMov2NIkYilT4Oa1Rkt diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_2.txt b/agbenchmark/challenges/memory/m4/artifacts/instructions_2.txt new file mode 100644 index 000000000..92203f5c1 --- /dev/null +++ b/agbenchmark/challenges/memory/m4/artifacts/instructions_2.txt @@ -0,0 +1,5 @@ +2yXfw2vPZCZq4jGOTHF4NEUYLbAUBIcmkgLxG7qXnYLNsvvZDqAvBPw4OcOfleIWvS6S5GThSPcrSfX94yB1TT6SVHGqPkulJUk4W1wfIFRIiOSps6V8ulLyrmeZsEJ6l9B9Vrm4h6SZTQVP750TUfECOH4d5j5AtupugjqThyw3t6ZFYHr2eUYRiOiTlng2uvsoZiioBQlUitrjQ4mw8FRL3VaR2aAhHGwaNV0Q7XelFU50YQgcUYqfxHxmqCLqb7dtZ7WWwxrLcqwVbj4y1YteFzPZyU4TJKopMVizgWaam8tKa1hYAQHqEaiAAHigqvYhutPHarpzc4PP2RLE4AZCxRblSY40iXpxQ9waXsrUEZ51ZRFmvm5G17wuKghMcKea2jN2MIgvSxNBy7cszFyBTe6V6u6IMk1wVWa0YulPslLc0bXUVKqZ54b61lyBAKSoFbJVRFYB5XZBL5tp2efvTsEQ3JvFmSREEOhmawIriifCApy1 + +The phrase to remember is 'The sneaky toaster stole my socks and ran away to Hawaii'. Read the file instructions_3.txt. + +BDLfeJBcfS4iqE9sNAm4ndZT2F1fsFYdXGRpRQ6xSXl014c9H7NeMbQCtFb7kRtVvzx9AItPj1uqtjA0R35N2Pj8FgxfSPDb8dlizLA6dbKY4JfCWmibzrBYoFzoxiPX57m3n8yLKHA0aejG38aMJ6XjR75kAjBW0Cw9d3Ny0MphakfW8KDZoMO3qwsPLLASYrz42K7JjThVGZvEXczRBY8la4NJPZpj91GmfsQuJezCvcI87gjfjtRDp1GECU9SmLSWBufjQWWlc4p6z5XtPPu0vqxRjoiFDFZvafU35KkEDcWAHv3KhR0Z20JD2qIrJ4CHntwGBAk61nMBpKhNx0t3ONK5X0WD7gNCdG64obji2ifsI8ZydLkROJkAJCpe4zRd04mkydCwKGJzmCGv0lu1KRn4QobFq7mEeuzD0xvvGtyiuiVXJSVqphf5ySmfjD4EvDCMRDNZx7c4pECUnLBPDlB8JwMyugfyD5mslte9YCG9kK6n diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_3.txt b/agbenchmark/challenges/memory/m4/artifacts/instructions_3.txt new file mode 100644 index 000000000..d7f6f08fc --- /dev/null +++ b/agbenchmark/challenges/memory/m4/artifacts/instructions_3.txt @@ -0,0 +1,5 @@ +ccgfgLxgQxZ1F3USqH9STHNVYAV3AnxU97C77kgsSkx51Czq0cLp4ZmedPY3dVNUtm2kR9HDahe7Cp6zc3zvpK9RlRKQ1rugGDkNbgc4H57MYOQpDKzbikvHWXtxFixXqNiuRJW8h8uAAKv7qsStgJ2loyM9zKVicCdHerR2Q3kT0No2DOxHUv8T1d7rN3Kf0Sh6nz2LOSqvLjtzTAxpkM7ehUzFuOoXjQnuVUHqXTzL26ASeMd74N5DgO81LofDb6mcvrmVMIY8tfDBqHV83a8zn9HySeg5ZapnAVI1KOXRXyZEgbsVe1rEy8slCXqEuYCqn8IudJBnRDNKnZRdhwQ8tCpXQ1D3quhLEJxjhpRFiLl7lEZvtkunL2UlDkErcFjfKfVJLNy2loQsB9aKbR3NXBGbQNXR9QQ2xyGJPl8Wnw03sCRjubn83LpMoC2jirhJld5VFTffPY3jw0MyX3vUF8kfHRGowsrzzcdNuESiRf3iMcmx + +The phrase to remember is 'My pet rock sings better than Beyoncé on Tuesdays'. Read the file instructions_4.txt. 
+ +8YwXVp6zAoxCfth85iCMA2BrfDW8JjjzieFnuj7GOhW8BlbTsymfiA59qTBCuThFk0wKlT4MDskG5dU7dMHuEAKslHwcfZywvUZYFacVgSn3VfQeWBUgXE7rrrYRclqpjI3nqYuqfVzNHxn0iG13j6q5SAauFSaP2x0CWdDnEGqi9Dh7DcvjVIzlXn3GLa1BTd0qceGYCJRT7UM9YTHJYHo7Ipr2Hgreawb09Xx0KovgzBSgJYpqRNwVRFTfpgvZZIEA0QDPZE28n8KgtDPHHAIroZP6KijKCjyNLjYyM0D4AtiuLDon8bcaLRhzhC6plMvI7pkpEpsct3u9zg8C8V0pONWpWNNCompsuwl4TECiVtVygrYNcroROjNs1CGfliHCvGBbADSQORrFbhRyYEhIVNgXQsh4ilpcOYLAZoxsYlFtJw5Mf7y8ebhay1uJfNaolFWD63QID5UHBqzXZUDupjtUhD0N4XEtNeVyTqXD4S5SffdQ diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_4.txt b/agbenchmark/challenges/memory/m4/artifacts/instructions_4.txt new file mode 100644 index 000000000..6c0a13bae --- /dev/null +++ b/agbenchmark/challenges/memory/m4/artifacts/instructions_4.txt @@ -0,0 +1,5 @@ +RpsPog8gFZhpbMDHImq5OBbzE9DwrzKaqldc7UVs8FXohjoAaLb7sYW291NVDdUUk4AYwHum0VotgPPdQp3wxGza9Se6pex9G6IDd1jxPW5UTnik0Ygns0U5m5uCvuF140ozvPBj5blJ5wWIiQOXceyl7RsXIP82w1u0yTMCmAdefJISC9HAsJKg2QBbtCNoGIHEIuCZFSdskBe9XaPxPoA40AD4UXpwKIE3qEui39QyePL1czeqSGGmmRgziDiMhFiSwNarA0rwehZPW3QKpACjRu9wcVoTg7wxT4R8bziUzwVDeI5ZeqNqNrTUYiLpfUEbckcRQD3FUALmEtzIdSMFdD9z8CiSXVEAOpUbiqm9itNBnMVaZQ2eyKxet4EvZrfUbkhGntIxf8VOsgXlkzb0iXcyHu1PogCjOkSWodlPfN4mYm0P4EvUGogI3suX3tCXp5EJiZHSIoyApxme6Nd6NkdbIjcQOep42z6z23cO5uPv41cA + +The phrase to remember is 'The giant hamster rode a unicycle through the crowded mall'. Read the file instructions_5.txt. + +1d2q36DxTTRcg8ijubOcPntReHzsM94G2O0xmclXuqh8kqVaYcNdl7mmQWNSSh4OEF6BUuhVg0s58UFeJnLNOzYeC8T0fYhhrvkw8enra8JS2VeQCnTxdn4GUhJXu4V0UOhaeNnArmO03M8n6mQbV77VnH5RSVR4qVWnYmHGaswdHxYOZVnxFSyzzmhdX1dK5q9EgIwadglyBkpKu15ZaMIH2SOPjil8YjD4euRZc2EGZDcvCkJAJeLsrbTQLM82k2fKqRFyuNELCQp1DRz6gExByQ11SbHRqednp6ndj8msLC116PVMqp8pSDVZ1RrTFV9UOReeO9wnveJbLUEMh8c3wlg6yXn0XrbO7qabvdKXAjIIF8JN9HdLoFGdtDPLJbHDyJPqAYt9OIfyr8knQFLJZjKCBn8QrYy1jdFIHnENf1vZTI1bA4SUOG81xeRHGDVln9mcjdlBHdWRHX9TPbaokSjl9q7nDk6734KpPullyvm10l1M diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_5.txt b/agbenchmark/challenges/memory/m4/artifacts/instructions_5.txt new file mode 100644 index 000000000..729f0aa0d --- /dev/null +++ b/agbenchmark/challenges/memory/m4/artifacts/instructions_5.txt @@ -0,0 +1,5 @@ +sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232 + +Write the phrases previously mentioned to a file called file_to_check.txt. 
+ +gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0 diff --git a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json new file mode 100644 index 000000000..e1ecb16f4 --- /dev/null +++ b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json @@ -0,0 +1,26 @@ +{ + "name": "remember_multiple_phrases_with_noise_mock", + "category": ["memory"], + "task": "Follow the instructions in the instructions_1.txt file", + "dependencies": [], + "ground": { + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyoncé on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "should_contain": [ + "The purple elephant danced on a rainbow while eating a taco", + "The sneaky toaster stole my socks and ran away to Hawaii", + "My pet rock sings better than Beyoncé on Tuesdays", + "The giant hamster rode a unicycle through the crowded mall" + ], + "should_not_contain": [], + "files": ["file_to_check.txt"] + }, + "mock": { + "mock_func": "remember_multiple_phrases_with_noise_mock", + "mock_task": "Follow the instructions in the instructions_1.txt file" + }, + "info": { + "difficulty": "medium", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.",
+    "side_effects": ["tests if there is in fact an LLM attached"]
+  }
+}
diff --git a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py
new file mode 100644
index 000000000..fd33da1c0
--- /dev/null
+++ b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py
@@ -0,0 +1,32 @@
+import os
+from typing import Any, Dict
+
+import pytest
+
+from agbenchmark.challenges.memory.memory import MemoryChallenge
+
+
+class TestRememberMultiplePhrasesWithNoise(MemoryChallenge):
+    """The fourth memory challenge"""
+
+    def get_file_path(self) -> str:  # all tests must implement this method
+        return os.path.join(
+            os.path.dirname(__file__), "remember_multiple_phrases_with_noise_data.json"
+        )
+
+    @pytest.mark.depends(
+        name="test_remember_multiple_phrases_with_noise",
+        depends=["test_remember_multiple_ids_with_noise"],
+    )
+    def test_method(self, config: Dict[str, Any]) -> None:
+        self.setup_challenge(config)
+
+        files_contents = self.open_files(config["workspace"], self.data.ground.files)
+
+        scores = []
+        for file_content in files_contents:
+            score = self.scoring(file_content, self.data.ground)
+            print("Your score is:", score)
+            scores.append(score)
+
+        assert 1 in scores
diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py
index 1ffb3de39..37ded0ae9 100644
--- a/agbenchmark/mocks/tests/basic_mocks.py
+++ b/agbenchmark/mocks/tests/basic_mocks.py
@@ -77,3 +77,14 @@ def remember_multiple_ids_mock(task: str, workspace: str) -> None:
         "file_to_check.txt",
         "3145\n3791\n9317\n9471",
     )
+
+
+def remember_multiple_phrases_with_noise_mock(task: str, workspace: str) -> None:
+    """
+    This mock writes to a file (creates one if it doesn't exist)
+    """
+    Challenge.write_to_file(
+        workspace,
+        "file_to_check.txt",
+        "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyoncé on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
+    )
diff --git a/regression_tests.json b/regression_tests.json
index 3b91a5c90..1195efbc9 100644
--- a/regression_tests.json
+++ b/regression_tests.json
@@ -40,5 +40,10 @@
     "difficulty": "medium",
     "dependencies": [],
     "test": "agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py"
+  },
+  "TestRememberMultiplePhrasesWithNoise": {
+    "difficulty": "medium",
+    "dependencies": [],
+    "test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py"
+  }
 }
\ No newline at end of file
-- 
cgit v1.2.3


From bfd0d5c826b3854c25b9db1f548315c74592b68d Mon Sep 17 00:00:00 2001
From: Silen Naihin
Date: Thu, 6 Jul 2023 21:00:45 -0400
Subject: Fix home_path, local mini-agi run works (#64)

Co-authored-by: merwanehamadi
---
 agbenchmark/agent_interface.py | 4 ++--
 agbenchmark/start_benchmark.py | 5 -----
 config.json                    | 4 ++--
 pyproject.toml                 | 1 +
 4 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py
index 4d74aac73..8e9e5a14c 100644
--- a/agbenchmark/agent_interface.py
+++ b/agbenchmark/agent_interface.py
@@ -33,11 +33,10 @@ def run_agent(
     )
 
     # Get the current working directory
-    cwd = os.getcwd()
+    cwd = os.path.join(os.getcwd(), config["home_path"])
 
     # Add current directory to Python's import path
     sys.path.append(cwd)
-
sys.path.append(os.path.join(cwd, config["home_path"])) command = [sys.executable, config["entry_path"], str(task)] process = subprocess.Popen( @@ -67,6 +66,7 @@ def run_agent( print( "The Python function has exceeded the time limit and was terminated." ) + # Terminate the process group process.terminate() break diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index 9c7b8e8da..295bbf4bf 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -57,11 +57,6 @@ def start(category: str, maintain: bool, mock: bool) -> int: set_key(".env", "MOCK_TEST", "True" if mock else "False") - # create workspace directory if it doesn't exist - workspace_path = os.path.abspath(config["workspace"]) - if not os.path.exists(workspace_path): - os.makedirs(workspace_path, exist_ok=True) - if not os.path.exists(REGRESSION_TESTS_PATH): with open(REGRESSION_TESTS_PATH, "a"): pass diff --git a/config.json b/config.json index ba2ec0b80..378e69025 100644 --- a/config.json +++ b/config.json @@ -1,6 +1,6 @@ { - "workspace": "projects/my-new-project/workspace", + "workspace": "${os.path.join(Path.home(), 'miniagi')}", "entry_path": "benchmarks.py", - "home_path": "", + "home_path": "agent/mini-agi/", "cutoff": 60 } diff --git a/pyproject.toml b/pyproject.toml index 7e95969af..e0d579cab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ markers = [ "retrieval", "regression", "basic", + "memory" ] [tool.poetry.scripts] -- cgit v1.2.3 From 9ede17891bb4a322d51ec2bf1cc9e60e93db0acd Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Fri, 7 Jul 2023 13:50:53 -0700 Subject: Add 'Debug simple typo with guidance' challenge (#65) Signed-off-by: Merwane Hamadi --- agbenchmark/agent_interface.py | 31 +++++++++-- agbenchmark/challenge.py | 62 ++++++++++++---------- agbenchmark/challenges/README.md | 3 +- agbenchmark/challenges/code/c1_test.py | 0 agbenchmark/challenges/code/code.py | 8 +++ .../challenges/code/d1/artifacts_in/__init__.py | 0 .../challenges/code/d1/artifacts_in/code.py | 13 +++++ .../challenges/code/d1/artifacts_in/test.py | 31 +++++++++++ .../challenges/code/d1/artifacts_out/__init__.py | 0 .../challenges/code/d1/artifacts_out/code.py | 12 +++++ .../challenges/code/d1/artifacts_out/test.py | 31 +++++++++++ .../d1/debug_simple_typo_with_guidance_data.json | 22 ++++++++ .../d1/debug_simple_typo_with_guidance_test.py | 31 +++++++++++ agbenchmark/challenges/define_task_types.py | 3 +- .../memory/m1/artifacts/instructions_1.txt | 2 - .../memory/m1/artifacts/instructions_2.txt | 1 - .../memory/m1/artifacts/instructions_3.txt | 1 - .../memory/m1/artifacts/instructions_4.txt | 1 - .../memory/m1/artifacts/instructions_5.txt | 1 - .../memory/m1/artifacts_in/instructions_1.txt | 2 + .../memory/m1/artifacts_in/instructions_2.txt | 1 + .../memory/m1/artifacts_in/instructions_3.txt | 1 + .../memory/m1/artifacts_in/instructions_4.txt | 1 + .../memory/m1/artifacts_in/instructions_5.txt | 1 + agbenchmark/challenges/memory/m1/m1_data.json | 3 +- agbenchmark/challenges/memory/m1/m1_test.py | 4 +- .../memory/m2/artifacts/instructions_1.txt | 1 - .../memory/m2/artifacts/instructions_2.txt | 1 - .../memory/m2/artifacts/instructions_3.txt | 1 - .../memory/m2/artifacts/instructions_4.txt | 1 - .../memory/m2/artifacts/instructions_5.txt | 1 - .../memory/m2/artifacts_in/instructions_1.txt | 1 + .../memory/m2/artifacts_in/instructions_2.txt | 1 + .../memory/m2/artifacts_in/instructions_3.txt | 1 + .../memory/m2/artifacts_in/instructions_4.txt | 1 + 
.../memory/m2/artifacts_in/instructions_5.txt | 1 + .../memory/m2/remember_multiple_ids_data.json | 3 +- .../memory/m2/remember_multiple_ids_test.py | 4 +- .../memory/m3/artifacts/instructions_1.txt | 5 -- .../memory/m3/artifacts/instructions_2.txt | 5 -- .../memory/m3/artifacts/instructions_3.txt | 5 -- .../memory/m3/artifacts/instructions_4.txt | 5 -- .../memory/m3/artifacts/instructions_5.txt | 5 -- .../memory/m3/artifacts_in/instructions_1.txt | 5 ++ .../memory/m3/artifacts_in/instructions_2.txt | 5 ++ .../memory/m3/artifacts_in/instructions_3.txt | 5 ++ .../memory/m3/artifacts_in/instructions_4.txt | 5 ++ .../memory/m3/artifacts_in/instructions_5.txt | 5 ++ .../m3/remember_multiple_ids_with_noise_data.json | 3 +- .../m3/remember_multiple_ids_with_noise_test.py | 4 +- .../memory/m4/artifacts/instructions_1.txt | 5 -- .../memory/m4/artifacts/instructions_2.txt | 5 -- .../memory/m4/artifacts/instructions_3.txt | 5 -- .../memory/m4/artifacts/instructions_4.txt | 5 -- .../memory/m4/artifacts/instructions_5.txt | 5 -- .../memory/m4/artifacts_in/instructions_1.txt | 5 ++ .../memory/m4/artifacts_in/instructions_2.txt | 5 ++ .../memory/m4/artifacts_in/instructions_3.txt | 5 ++ .../memory/m4/artifacts_in/instructions_4.txt | 5 ++ .../memory/m4/artifacts_in/instructions_5.txt | 5 ++ .../remember_multiple_phrases_with_noise_data.json | 3 +- .../remember_multiple_phrases_with_noise_test.py | 4 +- agbenchmark/challenges/retrieval/r1/r1_data.json | 3 +- agbenchmark/challenges/retrieval/r1/r1_test.py | 4 +- agbenchmark/challenges/retrieval/r2/r2_data.json | 3 +- agbenchmark/challenges/retrieval/r2/r2_test.py | 4 +- agbenchmark/challenges/retrieval/r3/r3_data.json | 3 +- agbenchmark/challenges/retrieval/r3/r3_test.py | 4 +- agbenchmark/mocks/tests/basic_mocks.py | 12 ----- .../read_file/artifacts/file_to_check.txt | 1 - .../read_file/artifacts_in/file_to_check.txt | 1 + .../read_file/artifacts_out/file_to_check.txt | 1 + .../basic_abilities/read_file/r_file_data.json | 7 +-- .../basic_abilities/read_file/read_file_test.py | 4 +- .../basic_abilities/write_file/w_file_data.json | 3 +- .../basic_abilities/write_file/write_file_test.py | 4 +- pyproject.toml | 1 + regression_tests.json | 45 +++++++++------- 78 files changed, 350 insertions(+), 147 deletions(-) delete mode 100644 agbenchmark/challenges/code/c1_test.py create mode 100644 agbenchmark/challenges/code/code.py create mode 100644 agbenchmark/challenges/code/d1/artifacts_in/__init__.py create mode 100644 agbenchmark/challenges/code/d1/artifacts_in/code.py create mode 100644 agbenchmark/challenges/code/d1/artifacts_in/test.py create mode 100644 agbenchmark/challenges/code/d1/artifacts_out/__init__.py create mode 100644 agbenchmark/challenges/code/d1/artifacts_out/code.py create mode 100644 agbenchmark/challenges/code/d1/artifacts_out/test.py create mode 100644 agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json create mode 100644 agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py delete mode 100644 agbenchmark/challenges/memory/m1/artifacts/instructions_1.txt delete mode 100644 agbenchmark/challenges/memory/m1/artifacts/instructions_2.txt delete mode 100644 agbenchmark/challenges/memory/m1/artifacts/instructions_3.txt delete mode 100644 agbenchmark/challenges/memory/m1/artifacts/instructions_4.txt delete mode 100644 agbenchmark/challenges/memory/m1/artifacts/instructions_5.txt create mode 100644 agbenchmark/challenges/memory/m1/artifacts_in/instructions_1.txt create mode 100644 
agbenchmark/challenges/memory/m1/artifacts_in/instructions_2.txt create mode 100644 agbenchmark/challenges/memory/m1/artifacts_in/instructions_3.txt create mode 100644 agbenchmark/challenges/memory/m1/artifacts_in/instructions_4.txt create mode 100644 agbenchmark/challenges/memory/m1/artifacts_in/instructions_5.txt delete mode 100644 agbenchmark/challenges/memory/m2/artifacts/instructions_1.txt delete mode 100644 agbenchmark/challenges/memory/m2/artifacts/instructions_2.txt delete mode 100644 agbenchmark/challenges/memory/m2/artifacts/instructions_3.txt delete mode 100644 agbenchmark/challenges/memory/m2/artifacts/instructions_4.txt delete mode 100644 agbenchmark/challenges/memory/m2/artifacts/instructions_5.txt create mode 100644 agbenchmark/challenges/memory/m2/artifacts_in/instructions_1.txt create mode 100644 agbenchmark/challenges/memory/m2/artifacts_in/instructions_2.txt create mode 100644 agbenchmark/challenges/memory/m2/artifacts_in/instructions_3.txt create mode 100644 agbenchmark/challenges/memory/m2/artifacts_in/instructions_4.txt create mode 100644 agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt delete mode 100644 agbenchmark/challenges/memory/m3/artifacts/instructions_1.txt delete mode 100644 agbenchmark/challenges/memory/m3/artifacts/instructions_2.txt delete mode 100644 agbenchmark/challenges/memory/m3/artifacts/instructions_3.txt delete mode 100644 agbenchmark/challenges/memory/m3/artifacts/instructions_4.txt delete mode 100644 agbenchmark/challenges/memory/m3/artifacts/instructions_5.txt create mode 100644 agbenchmark/challenges/memory/m3/artifacts_in/instructions_1.txt create mode 100644 agbenchmark/challenges/memory/m3/artifacts_in/instructions_2.txt create mode 100644 agbenchmark/challenges/memory/m3/artifacts_in/instructions_3.txt create mode 100644 agbenchmark/challenges/memory/m3/artifacts_in/instructions_4.txt create mode 100644 agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt delete mode 100644 agbenchmark/challenges/memory/m4/artifacts/instructions_1.txt delete mode 100644 agbenchmark/challenges/memory/m4/artifacts/instructions_2.txt delete mode 100644 agbenchmark/challenges/memory/m4/artifacts/instructions_3.txt delete mode 100644 agbenchmark/challenges/memory/m4/artifacts/instructions_4.txt delete mode 100644 agbenchmark/challenges/memory/m4/artifacts/instructions_5.txt create mode 100644 agbenchmark/challenges/memory/m4/artifacts_in/instructions_1.txt create mode 100644 agbenchmark/challenges/memory/m4/artifacts_in/instructions_2.txt create mode 100644 agbenchmark/challenges/memory/m4/artifacts_in/instructions_3.txt create mode 100644 agbenchmark/challenges/memory/m4/artifacts_in/instructions_4.txt create mode 100644 agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt delete mode 100644 agbenchmark/tests/basic_abilities/read_file/artifacts/file_to_check.txt create mode 100644 agbenchmark/tests/basic_abilities/read_file/artifacts_in/file_to_check.txt create mode 100644 agbenchmark/tests/basic_abilities/read_file/artifacts_out/file_to_check.txt diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index 8e9e5a14c..05540f6d3 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -1,4 +1,5 @@ import os +import shutil import subprocess import sys import time @@ -14,13 +15,20 @@ MOCK_FLAG = os.getenv("MOCK_TEST") def run_agent( - task: Optional[str], mock_func: Optional[str], config: Dict[str, Any] + task: Optional[str], + mock_func: Optional[str], + config: Dict[str, 
Any],
+    challenge_location: str,
 ) -> None:
     """Calling to get a response"""
-    if mock_func == None and MOCK_FLAG == "True":
-        print("No mock provided")
-    elif MOCK_FLAG == "True":
+    if MOCK_FLAG == "True":
+        copy_artifacts_into_workspace(
+            config["workspace"], "artifacts_out", challenge_location
+        )
+        if mock_func is None:
+            print("No mock provided")
+            return
         mock_manager = MockManager(
             task, config
         )  # workspace doesn't need to be passed in, stays the same
@@ -77,4 +85,19 @@ def run_agent(
     process.wait()
 
 
+def copy_artifacts_into_workspace(
+    workspace: str, artifact_folder_name: str, challenge_dir_path: str
+) -> None:
+    source_dir = os.path.join(challenge_dir_path, artifact_folder_name)
+
+    # Check if source_dir exists, if not then return immediately.
+    if not os.path.exists(source_dir):
+        return
+
+    for file_name in os.listdir(source_dir):
+        full_file_name = os.path.join(source_dir, file_name)
+        if os.path.isfile(full_file_name):
+            shutil.copy(full_file_name, workspace)
+
+
 ENVIRONMENT = os.getenv("ENVIRONMENT") or "production"
diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py
index dee2b435e..4c8e69848 100644
--- a/agbenchmark/challenge.py
+++ b/agbenchmark/challenge.py
@@ -1,9 +1,10 @@
 import glob
 import inspect
 import os
-import shutil
-from abc import ABC, abstractmethod
-from typing import Any, Dict, List, Optional
+import subprocess
+import types
+from abc import ABC, ABCMeta, abstractmethod
+from typing import Any, Dict, List, Optional, Tuple, Type, cast
 
 import pytest
 from dotenv import load_dotenv
@@ -16,7 +17,20 @@ mock_test_str = os.getenv("MOCK_TEST")
 MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str else False
 
 
-class Challenge(ABC):
+class ChallengeMeta(ABCMeta):
+    def __init__(self, name: str, bases: Tuple[Type, ...], dct: Dict[str, Any]) -> None:
+
+        super().__init__(name, bases, dct)
+        try:
+            frame = cast(types.FrameType, inspect.currentframe())
+            assert frame.f_back is not None
+            self.CHALLENGE_LOCATION = os.path.dirname(inspect.getfile(frame.f_back))
+        except Exception as e:
+            print(f"Unable to get the file of the calling frame due to: {str(e)}")
+            raise e
+
+
+class Challenge(ABC, metaclass=ChallengeMeta):
     """The parent class to all specific challenges classes.
Defines helper methods for running a challenge""" @@ -52,11 +66,13 @@ class Challenge(ABC): return self.data.dependencies def setup_challenge(self, config: Dict[str, Any]) -> None: - from agbenchmark.agent_interface import run_agent + from agbenchmark.agent_interface import copy_artifacts_into_workspace, run_agent - self.copy_artifacts_into_workspace(config["workspace"]) + copy_artifacts_into_workspace( + config["workspace"], "artifacts_in", self.__class__.CHALLENGE_LOCATION + ) - run_agent(self.task, self.mock, config) + run_agent(self.task, self.mock, config, self.__class__.CHALLENGE_LOCATION) @property def name(self) -> str: @@ -77,8 +93,7 @@ class Challenge(ABC): with open(workspace_dir, "r") as f: return f.read() - @staticmethod - def open_files(workspace: str, file_patterns: list) -> List[str]: + def get_artifacts_out(self, workspace: str, file_patterns: list) -> List[str]: script_dir = os.path.abspath(workspace) files_contents = [] @@ -92,8 +107,17 @@ class Challenge(ABC): matching_files = [os.path.join(script_dir, file_pattern)] for file_path in matching_files: - with open(file_path, "r") as f: - files_contents.append(f.read()) + if self.data.ground.type == "execute_python_code": + result = subprocess.run( + ["python3", file_path], + cwd=os.path.abspath(workspace), + capture_output=True, + text=True, + ) + files_contents.append(result.stdout) + else: + with open(file_path, "r") as f: + files_contents.append(f.read()) return files_contents @@ -135,19 +159,3 @@ class Challenge(ABC): ) return 1.0 - - def copy_artifacts_into_workspace(self, workspace: str) -> None: - curr_frame = inspect.currentframe() - outer_frame = inspect.getouterframes(curr_frame)[2] - caller_file_path = outer_frame.filename - caller_dir_path = os.path.dirname(os.path.abspath(caller_file_path)) - source_dir = os.path.join(caller_dir_path, "artifacts") - - # Check if source_dir exists, if not then return immediately. 
-        if not os.path.exists(source_dir):
-            return
-
-        for file_name in os.listdir(source_dir):
-            full_file_name = os.path.join(source_dir, file_name)
-            if os.path.isfile(full_file_name):
-                shutil.copy(full_file_name, workspace)
diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md
index 9e74d19ce..2d782d1fc 100644
--- a/agbenchmark/challenges/README.md
+++ b/agbenchmark/challenges/README.md
@@ -33,7 +33,8 @@ Example:
     "answer": "Washington",
     "should_contain": ["Washington"],
     "should_not_contain": ["New York", "Los Angeles", "San Francisco"],
-    "files": [".txt"]
+    "files": [".txt"],
+    "type": "file"
   },
   "mock": {
     "mock_func": "basic_write_file_mock",
diff --git a/agbenchmark/challenges/code/c1_test.py b/agbenchmark/challenges/code/c1_test.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/agbenchmark/challenges/code/code.py b/agbenchmark/challenges/code/code.py
new file mode 100644
index 000000000..508d24a90
--- /dev/null
+++ b/agbenchmark/challenges/code/code.py
@@ -0,0 +1,8 @@
+import pytest
+
+from agbenchmark.challenge import Challenge
+
+
+@pytest.mark.code
+class CodeChallenge(Challenge):
+    """Challenge for code"""
diff --git a/agbenchmark/challenges/code/d1/artifacts_in/__init__.py b/agbenchmark/challenges/code/d1/artifacts_in/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/agbenchmark/challenges/code/d1/artifacts_in/code.py b/agbenchmark/challenges/code/d1/artifacts_in/code.py
new file mode 100644
index 000000000..df8120bfa
--- /dev/null
+++ b/agbenchmark/challenges/code/d1/artifacts_in/code.py
@@ -0,0 +1,13 @@
+# mypy: ignore-errors
+from typing import List, Optional
+
+
+def two_sum(nums: List, target: int) -> Optional[List[int]]:
+    seen = {}
+    for i, num in enumerate(nums):
+        typo
+        complement = target - num
+        if complement in seen:
+            return [seen[complement], i]
+        seen[num] = i
+    return None
diff --git a/agbenchmark/challenges/code/d1/artifacts_in/test.py b/agbenchmark/challenges/code/d1/artifacts_in/test.py
new file mode 100644
index 000000000..d85d13537
--- /dev/null
+++ b/agbenchmark/challenges/code/d1/artifacts_in/test.py
@@ -0,0 +1,31 @@
+# mypy: ignore-errors
+from code import two_sum
+from typing import List
+
+
+def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
+    result = two_sum(nums, target)
+    print(result)
+    assert (
+        result == expected_result
+    ), f"AssertionError: Expected the output to be {expected_result}"
+
+
+if __name__ == "__main__":
+    # test the trivial case with the first two numbers
+    nums = [2, 7, 11, 15]
+    target = 9
+    expected_result = [0, 1]
+    test_two_sum(nums, target, expected_result)
+
+    # test for ability to use zero and the same number twice
+    nums = [2, 7, 0, 15, 12, 0]
+    target = 0
+    expected_result = [2, 5]
+    test_two_sum(nums, target, expected_result)
+
+    # test for first and last index usage and negative numbers
+    nums = [-6, 7, 11, 4]
+    target = -2
+    expected_result = [0, 3]
+    test_two_sum(nums, target, expected_result)
diff --git a/agbenchmark/challenges/code/d1/artifacts_out/__init__.py b/agbenchmark/challenges/code/d1/artifacts_out/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/agbenchmark/challenges/code/d1/artifacts_out/code.py b/agbenchmark/challenges/code/d1/artifacts_out/code.py
new file mode 100644
index 000000000..de3d8c62c
--- /dev/null
+++ b/agbenchmark/challenges/code/d1/artifacts_out/code.py
@@ -0,0 +1,12 @@
+# mypy: ignore-errors
+from typing import List, Optional
+
+
+def two_sum(nums: List, target: int) -> Optional[List[int]]:
+    seen = {}
+    for i, num in enumerate(nums):
+        complement = target - num
+        if complement in seen:
+            return [seen[complement], i]
+        seen[num] = i
+    return None
diff --git a/agbenchmark/challenges/code/d1/artifacts_out/test.py b/agbenchmark/challenges/code/d1/artifacts_out/test.py
new file mode 100644
index 000000000..d85d13537
--- /dev/null
+++ b/agbenchmark/challenges/code/d1/artifacts_out/test.py
@@ -0,0 +1,31 @@
+# mypy: ignore-errors
+from code import two_sum
+from typing import List
+
+
+def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None:
+    result = two_sum(nums, target)
+    print(result)
+    assert (
+        result == expected_result
+    ), f"AssertionError: Expected the output to be {expected_result}"
+
+
+if __name__ == "__main__":
+    # test the trivial case with the first two numbers
+    nums = [2, 7, 11, 15]
+    target = 9
+    expected_result = [0, 1]
+    test_two_sum(nums, target, expected_result)
+
+    # test for ability to use zero and the same number twice
+    nums = [2, 7, 0, 15, 12, 0]
+    target = 0
+    expected_result = [2, 5]
+    test_two_sum(nums, target, expected_result)
+
+    # test for first and last index usage and negative numbers
+    nums = [-6, 7, 11, 4]
+    target = -2
+    expected_result = [0, 3]
+    test_two_sum(nums, target, expected_result)
diff --git a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json
new file mode 100644
index 000000000..ce9d92987
--- /dev/null
+++ b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json
@@ -0,0 +1,22 @@
+{
+  "name": "debug_simple_typo_with_guidance",
+  "category": ["code"],
+  "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
+  "dependencies": [],
+  "ground": {
+    "answer": "2314",
+    "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"],
+    "should_not_contain": [],
+    "files": ["test.py"],
+    "type": "execute_python_code"
+  },
+  "mock": {
+    "mock_func": null,
+    "mock_task": null
+  },
+  "info": {
+    "difficulty": "basic",
+    "description": "Tests ability for the agent to debug python code with a simple typo in it.",
+    "side_effects": ["tests if there is in fact an LLM attached"]
+  }
+}
diff --git a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py
new file mode 100644
index 000000000..e5f50c700
--- /dev/null
+++ b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py
@@ -0,0 +1,31 @@
+import os
+from typing import Any, Dict
+
+import pytest
+
+from agbenchmark.challenges.code.code import CodeChallenge
+
+
+class TestDebugSimpleTypoWithGuidance(CodeChallenge):
+    """The first code challenge"""
+
+    def get_file_path(self) -> str:  # all tests must implement this method
+        return os.path.join(
+            os.path.dirname(__file__), "debug_simple_typo_with_guidance_data.json"
+        )
+
+    @pytest.mark.depends(name="test_debug_simple_typo_with_guidance")
+    def test_method(self, config: Dict[str, Any]) -> None:
+        self.setup_challenge(config)
+
+        files_contents = self.get_artifacts_out(
+            config["workspace"], self.data.ground.files
+        )
+
+        scores = []
+        for file_content in files_contents:
+            score = self.scoring(file_content, self.data.ground)
+            print("Your score is:", score)
+            scores.append(score)
+
+        assert 1 in scores
diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py
index 52df3017b..f84df1262
100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -5,7 +5,7 @@ from pydantic import BaseModel class Mock(BaseModel): - mock_func: str + mock_func: Optional[str] = None mock_task: Optional[str] = None @@ -20,6 +20,7 @@ class Ground(BaseModel): should_contain: Optional[List[str]] = None should_not_contain: Optional[List[str]] = None files: List[str] + type: str class ChallengeData(BaseModel): diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_1.txt b/agbenchmark/challenges/memory/m1/artifacts/instructions_1.txt deleted file mode 100644 index da3babb1f..000000000 --- a/agbenchmark/challenges/memory/m1/artifacts/instructions_1.txt +++ /dev/null @@ -1,2 +0,0 @@ -The id to remember is 2314 -Read the file instructions_2.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_2.txt b/agbenchmark/challenges/memory/m1/artifacts/instructions_2.txt deleted file mode 100644 index 87c870b1a..000000000 --- a/agbenchmark/challenges/memory/m1/artifacts/instructions_2.txt +++ /dev/null @@ -1 +0,0 @@ -Read the file instructions_3.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_3.txt b/agbenchmark/challenges/memory/m1/artifacts/instructions_3.txt deleted file mode 100644 index 987086208..000000000 --- a/agbenchmark/challenges/memory/m1/artifacts/instructions_3.txt +++ /dev/null @@ -1 +0,0 @@ -Read the file instructions_4.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_4.txt b/agbenchmark/challenges/memory/m1/artifacts/instructions_4.txt deleted file mode 100644 index 78da710b8..000000000 --- a/agbenchmark/challenges/memory/m1/artifacts/instructions_4.txt +++ /dev/null @@ -1 +0,0 @@ -Read the file instructions_5.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts/instructions_5.txt b/agbenchmark/challenges/memory/m1/artifacts/instructions_5.txt deleted file mode 100644 index 8a2d35509..000000000 --- a/agbenchmark/challenges/memory/m1/artifacts/instructions_5.txt +++ /dev/null @@ -1 +0,0 @@ -Write the id previously mentioned to a .txt file. 
diff --git a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_1.txt b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_1.txt new file mode 100644 index 000000000..da3babb1f --- /dev/null +++ b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_1.txt @@ -0,0 +1,2 @@ +The id to remember is 2314 +Read the file instructions_2.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_2.txt b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_2.txt new file mode 100644 index 000000000..87c870b1a --- /dev/null +++ b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_2.txt @@ -0,0 +1 @@ +Read the file instructions_3.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_3.txt b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_3.txt new file mode 100644 index 000000000..987086208 --- /dev/null +++ b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_3.txt @@ -0,0 +1 @@ +Read the file instructions_4.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_4.txt b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_4.txt new file mode 100644 index 000000000..78da710b8 --- /dev/null +++ b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_4.txt @@ -0,0 +1 @@ +Read the file instructions_5.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_5.txt new file mode 100644 index 000000000..8a2d35509 --- /dev/null +++ b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_5.txt @@ -0,0 +1 @@ +Write the id previously mentioned to a .txt file. diff --git a/agbenchmark/challenges/memory/m1/m1_data.json b/agbenchmark/challenges/memory/m1/m1_data.json index 7023f85fc..3e410ac53 100644 --- a/agbenchmark/challenges/memory/m1/m1_data.json +++ b/agbenchmark/challenges/memory/m1/m1_data.json @@ -7,7 +7,8 @@ "answer": "2314", "should_contain": ["2314"], "should_not_contain": [], - "files": ["file_to_check.txt"] + "files": ["file_to_check.txt"], + "type": "file" }, "mock": { "mock_func": "basic_memory_mock", diff --git a/agbenchmark/challenges/memory/m1/m1_test.py b/agbenchmark/challenges/memory/m1/m1_test.py index 28e600cc3..c1f370244 100644 --- a/agbenchmark/challenges/memory/m1/m1_test.py +++ b/agbenchmark/challenges/memory/m1/m1_test.py @@ -16,7 +16,9 @@ class TestBasicMemory(MemoryChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git a/agbenchmark/challenges/memory/m2/artifacts/instructions_1.txt b/agbenchmark/challenges/memory/m2/artifacts/instructions_1.txt deleted file mode 100644 index d304ce632..000000000 --- a/agbenchmark/challenges/memory/m2/artifacts/instructions_1.txt +++ /dev/null @@ -1 +0,0 @@ -The id to remember is 3145. Read the file instructions_2.txt. diff --git a/agbenchmark/challenges/memory/m2/artifacts/instructions_2.txt b/agbenchmark/challenges/memory/m2/artifacts/instructions_2.txt deleted file mode 100644 index cfff66ba2..000000000 --- a/agbenchmark/challenges/memory/m2/artifacts/instructions_2.txt +++ /dev/null @@ -1 +0,0 @@ -The id to remember is 3791. Read the file instructions_3.txt. 
diff --git a/agbenchmark/challenges/memory/m2/artifacts/instructions_3.txt b/agbenchmark/challenges/memory/m2/artifacts/instructions_3.txt deleted file mode 100644 index ad16b6fdc..000000000 --- a/agbenchmark/challenges/memory/m2/artifacts/instructions_3.txt +++ /dev/null @@ -1 +0,0 @@ -The id to remember is 9317. Read the file instructions_4.txt. diff --git a/agbenchmark/challenges/memory/m2/artifacts/instructions_4.txt b/agbenchmark/challenges/memory/m2/artifacts/instructions_4.txt deleted file mode 100644 index 2394d64bc..000000000 --- a/agbenchmark/challenges/memory/m2/artifacts/instructions_4.txt +++ /dev/null @@ -1 +0,0 @@ -The id to remember is 9471. Read the file instructions_5.txt. diff --git a/agbenchmark/challenges/memory/m2/artifacts/instructions_5.txt b/agbenchmark/challenges/memory/m2/artifacts/instructions_5.txt deleted file mode 100644 index 92a6fba94..000000000 --- a/agbenchmark/challenges/memory/m2/artifacts/instructions_5.txt +++ /dev/null @@ -1 +0,0 @@ -Write the ids previously mentioned to a file called file_to_check.txt. diff --git a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_1.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_1.txt new file mode 100644 index 000000000..d304ce632 --- /dev/null +++ b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_1.txt @@ -0,0 +1 @@ +The id to remember is 3145. Read the file instructions_2.txt. diff --git a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_2.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_2.txt new file mode 100644 index 000000000..cfff66ba2 --- /dev/null +++ b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_2.txt @@ -0,0 +1 @@ +The id to remember is 3791. Read the file instructions_3.txt. diff --git a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_3.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_3.txt new file mode 100644 index 000000000..ad16b6fdc --- /dev/null +++ b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_3.txt @@ -0,0 +1 @@ +The id to remember is 9317. Read the file instructions_4.txt. diff --git a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_4.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_4.txt new file mode 100644 index 000000000..2394d64bc --- /dev/null +++ b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_4.txt @@ -0,0 +1 @@ +The id to remember is 9471. Read the file instructions_5.txt. diff --git a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt new file mode 100644 index 000000000..92a6fba94 --- /dev/null +++ b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt @@ -0,0 +1 @@ +Write the ids previously mentioned to a file called file_to_check.txt. 
diff --git a/agbenchmark/challenges/memory/m2/remember_multiple_ids_data.json b/agbenchmark/challenges/memory/m2/remember_multiple_ids_data.json index 374df6165..29d7339b8 100644 --- a/agbenchmark/challenges/memory/m2/remember_multiple_ids_data.json +++ b/agbenchmark/challenges/memory/m2/remember_multiple_ids_data.json @@ -7,7 +7,8 @@ "answer": "3145\n3791\n9317\n9471", "should_contain": ["3145", "3791", "9317", "9471"], "should_not_contain": [], - "files": ["file_to_check.txt"] + "files": ["file_to_check.txt"], + "type": "file" }, "mock": { "mock_func": "remember_multiple_ids_mock", diff --git a/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py b/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py index d5f0cf1a9..f0f2b3971 100644 --- a/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py +++ b/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py @@ -20,7 +20,9 @@ class TestRememberMultipleIds(MemoryChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git a/agbenchmark/challenges/memory/m3/artifacts/instructions_1.txt b/agbenchmark/challenges/memory/m3/artifacts/instructions_1.txt deleted file mode 100644 index 8deee1b09..000000000 --- a/agbenchmark/challenges/memory/m3/artifacts/instructions_1.txt +++ /dev/null @@ -1,5 +0,0 @@ -xBd9cZTfaExYalwtUVR7m0pe3Nfaf5uBS4IxGFZPZcQjDf8Tfk2vdpqWI0ESBWCdVzsdlxMmUxq43INSz1iftsv6PTOMGQ88Cojwj5mQXp8XKZ6VJC893BDLVLLW00iQy8VerWjQay9rAJz8rYVZHa6dByYNWZNOgtjC7ejnKt0VYZtUFfRBZNWG2HNX3tgX8H2h4xeu7GIfm4wETGvzlZDANeqiY2hMCXEAsliVXXjgmvVeB05tjkS5uvq5uV2DnNyStimIVVdEMFI5Ft0qM82aMvJlUtVj6TJEmE0qPTqBXeHud72iRTcBa9okCzjYiAd6oSoJ8k9o6lmFTeq323ILYCGzsICjqoysuVonfHUDh1Ll2LTo4I2AygfPqCqvgQWq9wa8YfWKBlwPPVy2lymJRTd1mS7RUaiagoNn76ApJviCYh2fWEZcxULCKAbbn0E6vz1CBADSOEIVB14ZyyRfiDcXbgYYcnOShwMsg0vYcKDKfAHk - -The id to remember is 3145. Read the file instructions_2.txt. - -OueiZyOoM5eGI5VkTt6Ue1XboZ4jztAa5IGWqSbhIhLiI4X2nOmJw6tLBd3smZNwKQpq8NHxZSk76Xd82yGI3l9KhLARXRT37MRkY7WG8YQVJEurki86cIGuXthSWjq9dDKh6ILSq4DdWPIHfDeV12psErCcgEUxWtoU6bnSMnaoYteOkKWTAkXdC1t4j5p3rXbMv1j92nLmCmoslT2A9noQIODWLdudtCecTMmrbq85RLBt5WFLGMfWVsuSrSMGo5tiN7vC1siLfhlhco0q5QaMv0px6kVg44Wceg3UXIUoMxTNoh9G8uEABJhvsF2qzxkbHuhk6VRuydIWoGgfN01upk6BDfvooyAkdcTJG5jFlHOJixTe4ramT5uP54oZ0anJTB6w7hybN3o9vb4xLbAFQxCZIXZ9HXgeBnl1L8qIvQg9VsklntCMsu2cm5CgIryRBGPqnTAbrhmAmFOkNyLSVFfYmu2wtlMov2NIkYilT4Oa1Rkt diff --git a/agbenchmark/challenges/memory/m3/artifacts/instructions_2.txt b/agbenchmark/challenges/memory/m3/artifacts/instructions_2.txt deleted file mode 100644 index c9b5fc5f9..000000000 --- a/agbenchmark/challenges/memory/m3/artifacts/instructions_2.txt +++ /dev/null @@ -1,5 +0,0 @@ -2yXfw2vPZCZq4jGOTHF4NEUYLbAUBIcmkgLxG7qXnYLNsvvZDqAvBPw4OcOfleIWvS6S5GThSPcrSfX94yB1TT6SVHGqPkulJUk4W1wfIFRIiOSps6V8ulLyrmeZsEJ6l9B9Vrm4h6SZTQVP750TUfECOH4d5j5AtupugjqThyw3t6ZFYHr2eUYRiOiTlng2uvsoZiioBQlUitrjQ4mw8FRL3VaR2aAhHGwaNV0Q7XelFU50YQgcUYqfxHxmqCLqb7dtZ7WWwxrLcqwVbj4y1YteFzPZyU4TJKopMVizgWaam8tKa1hYAQHqEaiAAHigqvYhutPHarpzc4PP2RLE4AZCxRblSY40iXpxQ9waXsrUEZ51ZRFmvm5G17wuKghMcKea2jN2MIgvSxNBy7cszFyBTe6V6u6IMk1wVWa0YulPslLc0bXUVKqZ54b61lyBAKSoFbJVRFYB5XZBL5tp2efvTsEQ3JvFmSREEOhmawIriifCApy1 - -The id to remember is 3791. Read the file instructions_3.txt. 
- -BDLfeJBcfS4iqE9sNAm4ndZT2F1fsFYdXGRpRQ6xSXl014c9H7NeMbQCtFb7kRtVvzx9AItPj1uqtjA0R35N2Pj8FgxfSPDb8dlizLA6dbKY4JfCWmibzrBYoFzoxiPX57m3n8yLKHA0aejG38aMJ6XjR75kAjBW0Cw9d3Ny0MphakfW8KDZoMO3qwsPLLASYrz42K7JjThVGZvEXczRBY8la4NJPZpj91GmfsQuJezCvcI87gjfjtRDp1GECU9SmLSWBufjQWWlc4p6z5XtPPu0vqxRjoiFDFZvafU35KkEDcWAHv3KhR0Z20JD2qIrJ4CHntwGBAk61nMBpKhNx0t3ONK5X0WD7gNCdG64obji2ifsI8ZydLkROJkAJCpe4zRd04mkydCwKGJzmCGv0lu1KRn4QobFq7mEeuzD0xvvGtyiuiVXJSVqphf5ySmfjD4EvDCMRDNZx7c4pECUnLBPDlB8JwMyugfyD5mslte9YCG9kK6n diff --git a/agbenchmark/challenges/memory/m3/artifacts/instructions_3.txt b/agbenchmark/challenges/memory/m3/artifacts/instructions_3.txt deleted file mode 100644 index a564a7c91..000000000 --- a/agbenchmark/challenges/memory/m3/artifacts/instructions_3.txt +++ /dev/null @@ -1,5 +0,0 @@ -ccgfgLxgQxZ1F3USqH9STHNVYAV3AnxU97C77kgsSkx51Czq0cLp4ZmedPY3dVNUtm2kR9HDahe7Cp6zc3zvpK9RlRKQ1rugGDkNbgc4H57MYOQpDKzbikvHWXtxFixXqNiuRJW8h8uAAKv7qsStgJ2loyM9zKVicCdHerR2Q3kT0No2DOxHUv8T1d7rN3Kf0Sh6nz2LOSqvLjtzTAxpkM7ehUzFuOoXjQnuVUHqXTzL26ASeMd74N5DgO81LofDb6mcvrmVMIY8tfDBqHV83a8zn9HySeg5ZapnAVI1KOXRXyZEgbsVe1rEy8slCXqEuYCqn8IudJBnRDNKnZRdhwQ8tCpXQ1D3quhLEJxjhpRFiLl7lEZvtkunL2UlDkErcFjfKfVJLNy2loQsB9aKbR3NXBGbQNXR9QQ2xyGJPl8Wnw03sCRjubn83LpMoC2jirhJld5VFTffPY3jw0MyX3vUF8kfHRGowsrzzcdNuESiRf3iMcmx - -The id to remember is 9317. Read the file instructions_4.txt. - -8YwXVp6zAoxCfth85iCMA2BrfDW8JjjzieFnuj7GOhW8BlbTsymfiA59qTBCuThFk0wKlT4MDskG5dU7dMHuEAKslHwcfZywvUZYFacVgSn3VfQeWBUgXE7rrrYRclqpjI3nqYuqfVzNHxn0iG13j6q5SAauFSaP2x0CWdDnEGqi9Dh7DcvjVIzlXn3GLa1BTd0qceGYCJRT7UM9YTHJYHo7Ipr2Hgreawb09Xx0KovgzBSgJYpqRNwVRFTfpgvZZIEA0QDPZE28n8KgtDPHHAIroZP6KijKCjyNLjYyM0D4AtiuLDon8bcaLRhzhC6plMvI7pkpEpsct3u9zg8C8V0pONWpWNNCompsuwl4TECiVtVygrYNcroROjNs1CGfliHCvGBbADSQORrFbhRyYEhIVNgXQsh4ilpcOYLAZoxsYlFtJw5Mf7y8ebhay1uJfNaolFWD63QID5UHBqzXZUDupjtUhD0N4XEtNeVyTqXD4S5SffdQ diff --git a/agbenchmark/challenges/memory/m3/artifacts/instructions_4.txt b/agbenchmark/challenges/memory/m3/artifacts/instructions_4.txt deleted file mode 100644 index 63fcd54fa..000000000 --- a/agbenchmark/challenges/memory/m3/artifacts/instructions_4.txt +++ /dev/null @@ -1,5 +0,0 @@ -RpsPog8gFZhpbMDHImq5OBbzE9DwrzKaqldc7UVs8FXohjoAaLb7sYW291NVDdUUk4AYwHum0VotgPPdQp3wxGza9Se6pex9G6IDd1jxPW5UTnik0Ygns0U5m5uCvuF140ozvPBj5blJ5wWIiQOXceyl7RsXIP82w1u0yTMCmAdefJISC9HAsJKg2QBbtCNoGIHEIuCZFSdskBe9XaPxPoA40AD4UXpwKIE3qEui39QyePL1czeqSGGmmRgziDiMhFiSwNarA0rwehZPW3QKpACjRu9wcVoTg7wxT4R8bziUzwVDeI5ZeqNqNrTUYiLpfUEbckcRQD3FUALmEtzIdSMFdD9z8CiSXVEAOpUbiqm9itNBnMVaZQ2eyKxet4EvZrfUbkhGntIxf8VOsgXlkzb0iXcyHu1PogCjOkSWodlPfN4mYm0P4EvUGogI3suX3tCXp5EJiZHSIoyApxme6Nd6NkdbIjcQOep42z6z23cO5uPv41cA - -The id to remember is 9471. Read the file instructions_5.txt. 
- -1d2q36DxTTRcg8ijubOcPntReHzsM94G2O0xmclXuqh8kqVaYcNdl7mmQWNSSh4OEF6BUuhVg0s58UFeJnLNOzYeC8T0fYhhrvkw8enra8JS2VeQCnTxdn4GUhJXu4V0UOhaeNnArmO03M8n6mQbV77VnH5RSVR4qVWnYmHGaswdHxYOZVnxFSyzzmhdX1dK5q9EgIwadglyBkpKu15ZaMIH2SOPjil8YjD4euRZc2EGZDcvCkJAJeLsrbTQLM82k2fKqRFyuNELCQp1DRz6gExByQ11SbHRqednp6ndj8msLC116PVMqp8pSDVZ1RrTFV9UOReeO9wnveJbLUEMh8c3wlg6yXn0XrbO7qabvdKXAjIIF8JN9HdLoFGdtDPLJbHDyJPqAYt9OIfyr8knQFLJZjKCBn8QrYy1jdFIHnENf1vZTI1bA4SUOG81xeRHGDVln9mcjdlBHdWRHX9TPbaokSjl9q7nDk6734KpPullyvm10l1M diff --git a/agbenchmark/challenges/memory/m3/artifacts/instructions_5.txt b/agbenchmark/challenges/memory/m3/artifacts/instructions_5.txt deleted file mode 100644 index 1c3f156e3..000000000 --- a/agbenchmark/challenges/memory/m3/artifacts/instructions_5.txt +++ /dev/null @@ -1,5 +0,0 @@ -sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232 - -Write the ids previously mentioned to a file called file_to_check.txt. - -gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0 diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_1.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_1.txt new file mode 100644 index 000000000..8deee1b09 --- /dev/null +++ b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_1.txt @@ -0,0 +1,5 @@ +xBd9cZTfaExYalwtUVR7m0pe3Nfaf5uBS4IxGFZPZcQjDf8Tfk2vdpqWI0ESBWCdVzsdlxMmUxq43INSz1iftsv6PTOMGQ88Cojwj5mQXp8XKZ6VJC893BDLVLLW00iQy8VerWjQay9rAJz8rYVZHa6dByYNWZNOgtjC7ejnKt0VYZtUFfRBZNWG2HNX3tgX8H2h4xeu7GIfm4wETGvzlZDANeqiY2hMCXEAsliVXXjgmvVeB05tjkS5uvq5uV2DnNyStimIVVdEMFI5Ft0qM82aMvJlUtVj6TJEmE0qPTqBXeHud72iRTcBa9okCzjYiAd6oSoJ8k9o6lmFTeq323ILYCGzsICjqoysuVonfHUDh1Ll2LTo4I2AygfPqCqvgQWq9wa8YfWKBlwPPVy2lymJRTd1mS7RUaiagoNn76ApJviCYh2fWEZcxULCKAbbn0E6vz1CBADSOEIVB14ZyyRfiDcXbgYYcnOShwMsg0vYcKDKfAHk + +The id to remember is 3145. Read the file instructions_2.txt. 
+ +OueiZyOoM5eGI5VkTt6Ue1XboZ4jztAa5IGWqSbhIhLiI4X2nOmJw6tLBd3smZNwKQpq8NHxZSk76Xd82yGI3l9KhLARXRT37MRkY7WG8YQVJEurki86cIGuXthSWjq9dDKh6ILSq4DdWPIHfDeV12psErCcgEUxWtoU6bnSMnaoYteOkKWTAkXdC1t4j5p3rXbMv1j92nLmCmoslT2A9noQIODWLdudtCecTMmrbq85RLBt5WFLGMfWVsuSrSMGo5tiN7vC1siLfhlhco0q5QaMv0px6kVg44Wceg3UXIUoMxTNoh9G8uEABJhvsF2qzxkbHuhk6VRuydIWoGgfN01upk6BDfvooyAkdcTJG5jFlHOJixTe4ramT5uP54oZ0anJTB6w7hybN3o9vb4xLbAFQxCZIXZ9HXgeBnl1L8qIvQg9VsklntCMsu2cm5CgIryRBGPqnTAbrhmAmFOkNyLSVFfYmu2wtlMov2NIkYilT4Oa1Rkt diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_2.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_2.txt new file mode 100644 index 000000000..c9b5fc5f9 --- /dev/null +++ b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_2.txt @@ -0,0 +1,5 @@ +2yXfw2vPZCZq4jGOTHF4NEUYLbAUBIcmkgLxG7qXnYLNsvvZDqAvBPw4OcOfleIWvS6S5GThSPcrSfX94yB1TT6SVHGqPkulJUk4W1wfIFRIiOSps6V8ulLyrmeZsEJ6l9B9Vrm4h6SZTQVP750TUfECOH4d5j5AtupugjqThyw3t6ZFYHr2eUYRiOiTlng2uvsoZiioBQlUitrjQ4mw8FRL3VaR2aAhHGwaNV0Q7XelFU50YQgcUYqfxHxmqCLqb7dtZ7WWwxrLcqwVbj4y1YteFzPZyU4TJKopMVizgWaam8tKa1hYAQHqEaiAAHigqvYhutPHarpzc4PP2RLE4AZCxRblSY40iXpxQ9waXsrUEZ51ZRFmvm5G17wuKghMcKea2jN2MIgvSxNBy7cszFyBTe6V6u6IMk1wVWa0YulPslLc0bXUVKqZ54b61lyBAKSoFbJVRFYB5XZBL5tp2efvTsEQ3JvFmSREEOhmawIriifCApy1 + +The id to remember is 3791. Read the file instructions_3.txt. + +BDLfeJBcfS4iqE9sNAm4ndZT2F1fsFYdXGRpRQ6xSXl014c9H7NeMbQCtFb7kRtVvzx9AItPj1uqtjA0R35N2Pj8FgxfSPDb8dlizLA6dbKY4JfCWmibzrBYoFzoxiPX57m3n8yLKHA0aejG38aMJ6XjR75kAjBW0Cw9d3Ny0MphakfW8KDZoMO3qwsPLLASYrz42K7JjThVGZvEXczRBY8la4NJPZpj91GmfsQuJezCvcI87gjfjtRDp1GECU9SmLSWBufjQWWlc4p6z5XtPPu0vqxRjoiFDFZvafU35KkEDcWAHv3KhR0Z20JD2qIrJ4CHntwGBAk61nMBpKhNx0t3ONK5X0WD7gNCdG64obji2ifsI8ZydLkROJkAJCpe4zRd04mkydCwKGJzmCGv0lu1KRn4QobFq7mEeuzD0xvvGtyiuiVXJSVqphf5ySmfjD4EvDCMRDNZx7c4pECUnLBPDlB8JwMyugfyD5mslte9YCG9kK6n diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_3.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_3.txt new file mode 100644 index 000000000..a564a7c91 --- /dev/null +++ b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_3.txt @@ -0,0 +1,5 @@ +ccgfgLxgQxZ1F3USqH9STHNVYAV3AnxU97C77kgsSkx51Czq0cLp4ZmedPY3dVNUtm2kR9HDahe7Cp6zc3zvpK9RlRKQ1rugGDkNbgc4H57MYOQpDKzbikvHWXtxFixXqNiuRJW8h8uAAKv7qsStgJ2loyM9zKVicCdHerR2Q3kT0No2DOxHUv8T1d7rN3Kf0Sh6nz2LOSqvLjtzTAxpkM7ehUzFuOoXjQnuVUHqXTzL26ASeMd74N5DgO81LofDb6mcvrmVMIY8tfDBqHV83a8zn9HySeg5ZapnAVI1KOXRXyZEgbsVe1rEy8slCXqEuYCqn8IudJBnRDNKnZRdhwQ8tCpXQ1D3quhLEJxjhpRFiLl7lEZvtkunL2UlDkErcFjfKfVJLNy2loQsB9aKbR3NXBGbQNXR9QQ2xyGJPl8Wnw03sCRjubn83LpMoC2jirhJld5VFTffPY3jw0MyX3vUF8kfHRGowsrzzcdNuESiRf3iMcmx + +The id to remember is 9317. Read the file instructions_4.txt. 
+ +8YwXVp6zAoxCfth85iCMA2BrfDW8JjjzieFnuj7GOhW8BlbTsymfiA59qTBCuThFk0wKlT4MDskG5dU7dMHuEAKslHwcfZywvUZYFacVgSn3VfQeWBUgXE7rrrYRclqpjI3nqYuqfVzNHxn0iG13j6q5SAauFSaP2x0CWdDnEGqi9Dh7DcvjVIzlXn3GLa1BTd0qceGYCJRT7UM9YTHJYHo7Ipr2Hgreawb09Xx0KovgzBSgJYpqRNwVRFTfpgvZZIEA0QDPZE28n8KgtDPHHAIroZP6KijKCjyNLjYyM0D4AtiuLDon8bcaLRhzhC6plMvI7pkpEpsct3u9zg8C8V0pONWpWNNCompsuwl4TECiVtVygrYNcroROjNs1CGfliHCvGBbADSQORrFbhRyYEhIVNgXQsh4ilpcOYLAZoxsYlFtJw5Mf7y8ebhay1uJfNaolFWD63QID5UHBqzXZUDupjtUhD0N4XEtNeVyTqXD4S5SffdQ diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_4.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_4.txt new file mode 100644 index 000000000..63fcd54fa --- /dev/null +++ b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_4.txt @@ -0,0 +1,5 @@ +RpsPog8gFZhpbMDHImq5OBbzE9DwrzKaqldc7UVs8FXohjoAaLb7sYW291NVDdUUk4AYwHum0VotgPPdQp3wxGza9Se6pex9G6IDd1jxPW5UTnik0Ygns0U5m5uCvuF140ozvPBj5blJ5wWIiQOXceyl7RsXIP82w1u0yTMCmAdefJISC9HAsJKg2QBbtCNoGIHEIuCZFSdskBe9XaPxPoA40AD4UXpwKIE3qEui39QyePL1czeqSGGmmRgziDiMhFiSwNarA0rwehZPW3QKpACjRu9wcVoTg7wxT4R8bziUzwVDeI5ZeqNqNrTUYiLpfUEbckcRQD3FUALmEtzIdSMFdD9z8CiSXVEAOpUbiqm9itNBnMVaZQ2eyKxet4EvZrfUbkhGntIxf8VOsgXlkzb0iXcyHu1PogCjOkSWodlPfN4mYm0P4EvUGogI3suX3tCXp5EJiZHSIoyApxme6Nd6NkdbIjcQOep42z6z23cO5uPv41cA + +The id to remember is 9471. Read the file instructions_5.txt. + +1d2q36DxTTRcg8ijubOcPntReHzsM94G2O0xmclXuqh8kqVaYcNdl7mmQWNSSh4OEF6BUuhVg0s58UFeJnLNOzYeC8T0fYhhrvkw8enra8JS2VeQCnTxdn4GUhJXu4V0UOhaeNnArmO03M8n6mQbV77VnH5RSVR4qVWnYmHGaswdHxYOZVnxFSyzzmhdX1dK5q9EgIwadglyBkpKu15ZaMIH2SOPjil8YjD4euRZc2EGZDcvCkJAJeLsrbTQLM82k2fKqRFyuNELCQp1DRz6gExByQ11SbHRqednp6ndj8msLC116PVMqp8pSDVZ1RrTFV9UOReeO9wnveJbLUEMh8c3wlg6yXn0XrbO7qabvdKXAjIIF8JN9HdLoFGdtDPLJbHDyJPqAYt9OIfyr8knQFLJZjKCBn8QrYy1jdFIHnENf1vZTI1bA4SUOG81xeRHGDVln9mcjdlBHdWRHX9TPbaokSjl9q7nDk6734KpPullyvm10l1M diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt new file mode 100644 index 000000000..1c3f156e3 --- /dev/null +++ b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt @@ -0,0 +1,5 @@ +sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232 + +Write the ids previously mentioned to a file called file_to_check.txt. 
+ +gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0 diff --git a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_data.json b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_data.json index 95c93ccb9..6b53c4571 100644 --- a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_data.json +++ b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_data.json @@ -7,7 +7,8 @@ "answer": "3145\n3791\n9317\n9471", "should_contain": ["3145", "3791", "9317", "9471"], "should_not_contain": [], - "files": ["file_to_check.txt"] + "files": ["file_to_check.txt"], + "type": "file" }, "mock": { "mock_func": "remember_multiple_ids_mock", diff --git a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py index 4d2d64957..493ea3574 100644 --- a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py +++ b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py @@ -21,7 +21,9 @@ class TestRememberMultipleIdsWithNoise(MemoryChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_1.txt b/agbenchmark/challenges/memory/m4/artifacts/instructions_1.txt deleted file mode 100644 index 1b1e0147c..000000000 --- a/agbenchmark/challenges/memory/m4/artifacts/instructions_1.txt +++ /dev/null @@ -1,5 +0,0 @@ -xBd9cZTfaExYalwtUVR7m0pe3Nfaf5uBS4IxGFZPZcQjDf8Tfk2vdpqWI0ESBWCdVzsdlxMmUxq43INSz1iftsv6PTOMGQ88Cojwj5mQXp8XKZ6VJC893BDLVLLW00iQy8VerWjQay9rAJz8rYVZHa6dByYNWZNOgtjC7ejnKt0VYZtUFfRBZNWG2HNX3tgX8H2h4xeu7GIfm4wETGvzlZDANeqiY2hMCXEAsliVXXjgmvVeB05tjkS5uvq5uV2DnNyStimIVVdEMFI5Ft0qM82aMvJlUtVj6TJEmE0qPTqBXeHud72iRTcBa9okCzjYiAd6oSoJ8k9o6lmFTeq323ILYCGzsICjqoysuVonfHUDh1Ll2LTo4I2AygfPqCqvgQWq9wa8YfWKBlwPPVy2lymJRTd1mS7RUaiagoNn76ApJviCYh2fWEZcxULCKAbbn0E6vz1CBADSOEIVB14ZyyRfiDcXbgYYcnOShwMsg0vYcKDKfAHk - -The phrase to remember is 'The purple elephant danced on a rainbow while eating a taco'. Read the file instructions_2.txt. 
- -OueiZyOoM5eGI5VkTt6Ue1XboZ4jztAa5IGWqSbhIhLiI4X2nOmJw6tLBd3smZNwKQpq8NHxZSk76Xd82yGI3l9KhLARXRT37MRkY7WG8YQVJEurki86cIGuXthSWjq9dDKh6ILSq4DdWPIHfDeV12psErCcgEUxWtoU6bnSMnaoYteOkKWTAkXdC1t4j5p3rXbMv1j92nLmCmoslT2A9noQIODWLdudtCecTMmrbq85RLBt5WFLGMfWVsuSrSMGo5tiN7vC1siLfhlhco0q5QaMv0px6kVg44Wceg3UXIUoMxTNoh9G8uEABJhvsF2qzxkbHuhk6VRuydIWoGgfN01upk6BDfvooyAkdcTJG5jFlHOJixTe4ramT5uP54oZ0anJTB6w7hybN3o9vb4xLbAFQxCZIXZ9HXgeBnl1L8qIvQg9VsklntCMsu2cm5CgIryRBGPqnTAbrhmAmFOkNyLSVFfYmu2wtlMov2NIkYilT4Oa1Rkt diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_2.txt b/agbenchmark/challenges/memory/m4/artifacts/instructions_2.txt deleted file mode 100644 index 92203f5c1..000000000 --- a/agbenchmark/challenges/memory/m4/artifacts/instructions_2.txt +++ /dev/null @@ -1,5 +0,0 @@ -2yXfw2vPZCZq4jGOTHF4NEUYLbAUBIcmkgLxG7qXnYLNsvvZDqAvBPw4OcOfleIWvS6S5GThSPcrSfX94yB1TT6SVHGqPkulJUk4W1wfIFRIiOSps6V8ulLyrmeZsEJ6l9B9Vrm4h6SZTQVP750TUfECOH4d5j5AtupugjqThyw3t6ZFYHr2eUYRiOiTlng2uvsoZiioBQlUitrjQ4mw8FRL3VaR2aAhHGwaNV0Q7XelFU50YQgcUYqfxHxmqCLqb7dtZ7WWwxrLcqwVbj4y1YteFzPZyU4TJKopMVizgWaam8tKa1hYAQHqEaiAAHigqvYhutPHarpzc4PP2RLE4AZCxRblSY40iXpxQ9waXsrUEZ51ZRFmvm5G17wuKghMcKea2jN2MIgvSxNBy7cszFyBTe6V6u6IMk1wVWa0YulPslLc0bXUVKqZ54b61lyBAKSoFbJVRFYB5XZBL5tp2efvTsEQ3JvFmSREEOhmawIriifCApy1 - -The phrase to remember is 'The sneaky toaster stole my socks and ran away to Hawaii'. Read the file instructions_3.txt. - -BDLfeJBcfS4iqE9sNAm4ndZT2F1fsFYdXGRpRQ6xSXl014c9H7NeMbQCtFb7kRtVvzx9AItPj1uqtjA0R35N2Pj8FgxfSPDb8dlizLA6dbKY4JfCWmibzrBYoFzoxiPX57m3n8yLKHA0aejG38aMJ6XjR75kAjBW0Cw9d3Ny0MphakfW8KDZoMO3qwsPLLASYrz42K7JjThVGZvEXczRBY8la4NJPZpj91GmfsQuJezCvcI87gjfjtRDp1GECU9SmLSWBufjQWWlc4p6z5XtPPu0vqxRjoiFDFZvafU35KkEDcWAHv3KhR0Z20JD2qIrJ4CHntwGBAk61nMBpKhNx0t3ONK5X0WD7gNCdG64obji2ifsI8ZydLkROJkAJCpe4zRd04mkydCwKGJzmCGv0lu1KRn4QobFq7mEeuzD0xvvGtyiuiVXJSVqphf5ySmfjD4EvDCMRDNZx7c4pECUnLBPDlB8JwMyugfyD5mslte9YCG9kK6n diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_3.txt b/agbenchmark/challenges/memory/m4/artifacts/instructions_3.txt deleted file mode 100644 index d7f6f08fc..000000000 --- a/agbenchmark/challenges/memory/m4/artifacts/instructions_3.txt +++ /dev/null @@ -1,5 +0,0 @@ -ccgfgLxgQxZ1F3USqH9STHNVYAV3AnxU97C77kgsSkx51Czq0cLp4ZmedPY3dVNUtm2kR9HDahe7Cp6zc3zvpK9RlRKQ1rugGDkNbgc4H57MYOQpDKzbikvHWXtxFixXqNiuRJW8h8uAAKv7qsStgJ2loyM9zKVicCdHerR2Q3kT0No2DOxHUv8T1d7rN3Kf0Sh6nz2LOSqvLjtzTAxpkM7ehUzFuOoXjQnuVUHqXTzL26ASeMd74N5DgO81LofDb6mcvrmVMIY8tfDBqHV83a8zn9HySeg5ZapnAVI1KOXRXyZEgbsVe1rEy8slCXqEuYCqn8IudJBnRDNKnZRdhwQ8tCpXQ1D3quhLEJxjhpRFiLl7lEZvtkunL2UlDkErcFjfKfVJLNy2loQsB9aKbR3NXBGbQNXR9QQ2xyGJPl8Wnw03sCRjubn83LpMoC2jirhJld5VFTffPY3jw0MyX3vUF8kfHRGowsrzzcdNuESiRf3iMcmx - -The phrase to remember is 'My pet rock sings better than Beyoncé on Tuesdays'. Read the file instructions_4.txt. 
- -8YwXVp6zAoxCfth85iCMA2BrfDW8JjjzieFnuj7GOhW8BlbTsymfiA59qTBCuThFk0wKlT4MDskG5dU7dMHuEAKslHwcfZywvUZYFacVgSn3VfQeWBUgXE7rrrYRclqpjI3nqYuqfVzNHxn0iG13j6q5SAauFSaP2x0CWdDnEGqi9Dh7DcvjVIzlXn3GLa1BTd0qceGYCJRT7UM9YTHJYHo7Ipr2Hgreawb09Xx0KovgzBSgJYpqRNwVRFTfpgvZZIEA0QDPZE28n8KgtDPHHAIroZP6KijKCjyNLjYyM0D4AtiuLDon8bcaLRhzhC6plMvI7pkpEpsct3u9zg8C8V0pONWpWNNCompsuwl4TECiVtVygrYNcroROjNs1CGfliHCvGBbADSQORrFbhRyYEhIVNgXQsh4ilpcOYLAZoxsYlFtJw5Mf7y8ebhay1uJfNaolFWD63QID5UHBqzXZUDupjtUhD0N4XEtNeVyTqXD4S5SffdQ diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_4.txt b/agbenchmark/challenges/memory/m4/artifacts/instructions_4.txt deleted file mode 100644 index 6c0a13bae..000000000 --- a/agbenchmark/challenges/memory/m4/artifacts/instructions_4.txt +++ /dev/null @@ -1,5 +0,0 @@ -RpsPog8gFZhpbMDHImq5OBbzE9DwrzKaqldc7UVs8FXohjoAaLb7sYW291NVDdUUk4AYwHum0VotgPPdQp3wxGza9Se6pex9G6IDd1jxPW5UTnik0Ygns0U5m5uCvuF140ozvPBj5blJ5wWIiQOXceyl7RsXIP82w1u0yTMCmAdefJISC9HAsJKg2QBbtCNoGIHEIuCZFSdskBe9XaPxPoA40AD4UXpwKIE3qEui39QyePL1czeqSGGmmRgziDiMhFiSwNarA0rwehZPW3QKpACjRu9wcVoTg7wxT4R8bziUzwVDeI5ZeqNqNrTUYiLpfUEbckcRQD3FUALmEtzIdSMFdD9z8CiSXVEAOpUbiqm9itNBnMVaZQ2eyKxet4EvZrfUbkhGntIxf8VOsgXlkzb0iXcyHu1PogCjOkSWodlPfN4mYm0P4EvUGogI3suX3tCXp5EJiZHSIoyApxme6Nd6NkdbIjcQOep42z6z23cO5uPv41cA - -The phrase to remember is 'The giant hamster rode a unicycle through the crowded mall'. Read the file instructions_5.txt. - -1d2q36DxTTRcg8ijubOcPntReHzsM94G2O0xmclXuqh8kqVaYcNdl7mmQWNSSh4OEF6BUuhVg0s58UFeJnLNOzYeC8T0fYhhrvkw8enra8JS2VeQCnTxdn4GUhJXu4V0UOhaeNnArmO03M8n6mQbV77VnH5RSVR4qVWnYmHGaswdHxYOZVnxFSyzzmhdX1dK5q9EgIwadglyBkpKu15ZaMIH2SOPjil8YjD4euRZc2EGZDcvCkJAJeLsrbTQLM82k2fKqRFyuNELCQp1DRz6gExByQ11SbHRqednp6ndj8msLC116PVMqp8pSDVZ1RrTFV9UOReeO9wnveJbLUEMh8c3wlg6yXn0XrbO7qabvdKXAjIIF8JN9HdLoFGdtDPLJbHDyJPqAYt9OIfyr8knQFLJZjKCBn8QrYy1jdFIHnENf1vZTI1bA4SUOG81xeRHGDVln9mcjdlBHdWRHX9TPbaokSjl9q7nDk6734KpPullyvm10l1M diff --git a/agbenchmark/challenges/memory/m4/artifacts/instructions_5.txt b/agbenchmark/challenges/memory/m4/artifacts/instructions_5.txt deleted file mode 100644 index 729f0aa0d..000000000 --- a/agbenchmark/challenges/memory/m4/artifacts/instructions_5.txt +++ /dev/null @@ -1,5 +0,0 @@ -sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232 - -Write the phrases previously mentioned to a file called file_to_check.txt. 
- -gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0 diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_1.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_1.txt new file mode 100644 index 000000000..1b1e0147c --- /dev/null +++ b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_1.txt @@ -0,0 +1,5 @@ +xBd9cZTfaExYalwtUVR7m0pe3Nfaf5uBS4IxGFZPZcQjDf8Tfk2vdpqWI0ESBWCdVzsdlxMmUxq43INSz1iftsv6PTOMGQ88Cojwj5mQXp8XKZ6VJC893BDLVLLW00iQy8VerWjQay9rAJz8rYVZHa6dByYNWZNOgtjC7ejnKt0VYZtUFfRBZNWG2HNX3tgX8H2h4xeu7GIfm4wETGvzlZDANeqiY2hMCXEAsliVXXjgmvVeB05tjkS5uvq5uV2DnNyStimIVVdEMFI5Ft0qM82aMvJlUtVj6TJEmE0qPTqBXeHud72iRTcBa9okCzjYiAd6oSoJ8k9o6lmFTeq323ILYCGzsICjqoysuVonfHUDh1Ll2LTo4I2AygfPqCqvgQWq9wa8YfWKBlwPPVy2lymJRTd1mS7RUaiagoNn76ApJviCYh2fWEZcxULCKAbbn0E6vz1CBADSOEIVB14ZyyRfiDcXbgYYcnOShwMsg0vYcKDKfAHk + +The phrase to remember is 'The purple elephant danced on a rainbow while eating a taco'. Read the file instructions_2.txt. + +OueiZyOoM5eGI5VkTt6Ue1XboZ4jztAa5IGWqSbhIhLiI4X2nOmJw6tLBd3smZNwKQpq8NHxZSk76Xd82yGI3l9KhLARXRT37MRkY7WG8YQVJEurki86cIGuXthSWjq9dDKh6ILSq4DdWPIHfDeV12psErCcgEUxWtoU6bnSMnaoYteOkKWTAkXdC1t4j5p3rXbMv1j92nLmCmoslT2A9noQIODWLdudtCecTMmrbq85RLBt5WFLGMfWVsuSrSMGo5tiN7vC1siLfhlhco0q5QaMv0px6kVg44Wceg3UXIUoMxTNoh9G8uEABJhvsF2qzxkbHuhk6VRuydIWoGgfN01upk6BDfvooyAkdcTJG5jFlHOJixTe4ramT5uP54oZ0anJTB6w7hybN3o9vb4xLbAFQxCZIXZ9HXgeBnl1L8qIvQg9VsklntCMsu2cm5CgIryRBGPqnTAbrhmAmFOkNyLSVFfYmu2wtlMov2NIkYilT4Oa1Rkt diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_2.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_2.txt new file mode 100644 index 000000000..92203f5c1 --- /dev/null +++ b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_2.txt @@ -0,0 +1,5 @@ +2yXfw2vPZCZq4jGOTHF4NEUYLbAUBIcmkgLxG7qXnYLNsvvZDqAvBPw4OcOfleIWvS6S5GThSPcrSfX94yB1TT6SVHGqPkulJUk4W1wfIFRIiOSps6V8ulLyrmeZsEJ6l9B9Vrm4h6SZTQVP750TUfECOH4d5j5AtupugjqThyw3t6ZFYHr2eUYRiOiTlng2uvsoZiioBQlUitrjQ4mw8FRL3VaR2aAhHGwaNV0Q7XelFU50YQgcUYqfxHxmqCLqb7dtZ7WWwxrLcqwVbj4y1YteFzPZyU4TJKopMVizgWaam8tKa1hYAQHqEaiAAHigqvYhutPHarpzc4PP2RLE4AZCxRblSY40iXpxQ9waXsrUEZ51ZRFmvm5G17wuKghMcKea2jN2MIgvSxNBy7cszFyBTe6V6u6IMk1wVWa0YulPslLc0bXUVKqZ54b61lyBAKSoFbJVRFYB5XZBL5tp2efvTsEQ3JvFmSREEOhmawIriifCApy1 + +The phrase to remember is 'The sneaky toaster stole my socks and ran away to Hawaii'. Read the file instructions_3.txt. 
+ +BDLfeJBcfS4iqE9sNAm4ndZT2F1fsFYdXGRpRQ6xSXl014c9H7NeMbQCtFb7kRtVvzx9AItPj1uqtjA0R35N2Pj8FgxfSPDb8dlizLA6dbKY4JfCWmibzrBYoFzoxiPX57m3n8yLKHA0aejG38aMJ6XjR75kAjBW0Cw9d3Ny0MphakfW8KDZoMO3qwsPLLASYrz42K7JjThVGZvEXczRBY8la4NJPZpj91GmfsQuJezCvcI87gjfjtRDp1GECU9SmLSWBufjQWWlc4p6z5XtPPu0vqxRjoiFDFZvafU35KkEDcWAHv3KhR0Z20JD2qIrJ4CHntwGBAk61nMBpKhNx0t3ONK5X0WD7gNCdG64obji2ifsI8ZydLkROJkAJCpe4zRd04mkydCwKGJzmCGv0lu1KRn4QobFq7mEeuzD0xvvGtyiuiVXJSVqphf5ySmfjD4EvDCMRDNZx7c4pECUnLBPDlB8JwMyugfyD5mslte9YCG9kK6n diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_3.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_3.txt new file mode 100644 index 000000000..d7f6f08fc --- /dev/null +++ b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_3.txt @@ -0,0 +1,5 @@ +ccgfgLxgQxZ1F3USqH9STHNVYAV3AnxU97C77kgsSkx51Czq0cLp4ZmedPY3dVNUtm2kR9HDahe7Cp6zc3zvpK9RlRKQ1rugGDkNbgc4H57MYOQpDKzbikvHWXtxFixXqNiuRJW8h8uAAKv7qsStgJ2loyM9zKVicCdHerR2Q3kT0No2DOxHUv8T1d7rN3Kf0Sh6nz2LOSqvLjtzTAxpkM7ehUzFuOoXjQnuVUHqXTzL26ASeMd74N5DgO81LofDb6mcvrmVMIY8tfDBqHV83a8zn9HySeg5ZapnAVI1KOXRXyZEgbsVe1rEy8slCXqEuYCqn8IudJBnRDNKnZRdhwQ8tCpXQ1D3quhLEJxjhpRFiLl7lEZvtkunL2UlDkErcFjfKfVJLNy2loQsB9aKbR3NXBGbQNXR9QQ2xyGJPl8Wnw03sCRjubn83LpMoC2jirhJld5VFTffPY3jw0MyX3vUF8kfHRGowsrzzcdNuESiRf3iMcmx + +The phrase to remember is 'My pet rock sings better than Beyoncé on Tuesdays'. Read the file instructions_4.txt. + +8YwXVp6zAoxCfth85iCMA2BrfDW8JjjzieFnuj7GOhW8BlbTsymfiA59qTBCuThFk0wKlT4MDskG5dU7dMHuEAKslHwcfZywvUZYFacVgSn3VfQeWBUgXE7rrrYRclqpjI3nqYuqfVzNHxn0iG13j6q5SAauFSaP2x0CWdDnEGqi9Dh7DcvjVIzlXn3GLa1BTd0qceGYCJRT7UM9YTHJYHo7Ipr2Hgreawb09Xx0KovgzBSgJYpqRNwVRFTfpgvZZIEA0QDPZE28n8KgtDPHHAIroZP6KijKCjyNLjYyM0D4AtiuLDon8bcaLRhzhC6plMvI7pkpEpsct3u9zg8C8V0pONWpWNNCompsuwl4TECiVtVygrYNcroROjNs1CGfliHCvGBbADSQORrFbhRyYEhIVNgXQsh4ilpcOYLAZoxsYlFtJw5Mf7y8ebhay1uJfNaolFWD63QID5UHBqzXZUDupjtUhD0N4XEtNeVyTqXD4S5SffdQ diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_4.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_4.txt new file mode 100644 index 000000000..6c0a13bae --- /dev/null +++ b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_4.txt @@ -0,0 +1,5 @@ +RpsPog8gFZhpbMDHImq5OBbzE9DwrzKaqldc7UVs8FXohjoAaLb7sYW291NVDdUUk4AYwHum0VotgPPdQp3wxGza9Se6pex9G6IDd1jxPW5UTnik0Ygns0U5m5uCvuF140ozvPBj5blJ5wWIiQOXceyl7RsXIP82w1u0yTMCmAdefJISC9HAsJKg2QBbtCNoGIHEIuCZFSdskBe9XaPxPoA40AD4UXpwKIE3qEui39QyePL1czeqSGGmmRgziDiMhFiSwNarA0rwehZPW3QKpACjRu9wcVoTg7wxT4R8bziUzwVDeI5ZeqNqNrTUYiLpfUEbckcRQD3FUALmEtzIdSMFdD9z8CiSXVEAOpUbiqm9itNBnMVaZQ2eyKxet4EvZrfUbkhGntIxf8VOsgXlkzb0iXcyHu1PogCjOkSWodlPfN4mYm0P4EvUGogI3suX3tCXp5EJiZHSIoyApxme6Nd6NkdbIjcQOep42z6z23cO5uPv41cA + +The phrase to remember is 'The giant hamster rode a unicycle through the crowded mall'. Read the file instructions_5.txt. 
+ +1d2q36DxTTRcg8ijubOcPntReHzsM94G2O0xmclXuqh8kqVaYcNdl7mmQWNSSh4OEF6BUuhVg0s58UFeJnLNOzYeC8T0fYhhrvkw8enra8JS2VeQCnTxdn4GUhJXu4V0UOhaeNnArmO03M8n6mQbV77VnH5RSVR4qVWnYmHGaswdHxYOZVnxFSyzzmhdX1dK5q9EgIwadglyBkpKu15ZaMIH2SOPjil8YjD4euRZc2EGZDcvCkJAJeLsrbTQLM82k2fKqRFyuNELCQp1DRz6gExByQ11SbHRqednp6ndj8msLC116PVMqp8pSDVZ1RrTFV9UOReeO9wnveJbLUEMh8c3wlg6yXn0XrbO7qabvdKXAjIIF8JN9HdLoFGdtDPLJbHDyJPqAYt9OIfyr8knQFLJZjKCBn8QrYy1jdFIHnENf1vZTI1bA4SUOG81xeRHGDVln9mcjdlBHdWRHX9TPbaokSjl9q7nDk6734KpPullyvm10l1M diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt new file mode 100644 index 000000000..729f0aa0d --- /dev/null +++ b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt @@ -0,0 +1,5 @@ +sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232 + +Write the phrases previously mentioned to a file called file_to_check.txt. + +gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0 diff --git a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json index e1ecb16f4..316ef9476 100644 --- a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json +++ b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json @@ -12,7 +12,8 @@ "The giant hamster rode a unicycle through the crowded mall" ], "should_not_contain": [], - "files": ["file_to_check.txt"] + "files": ["file_to_check.txt"], + "type": "file" }, "mock": { "mock_func": "remember_multiple_phrases_with_noise_mock", diff --git a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py index fd33da1c0..e37e9a385 100644 --- a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py +++ b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py @@ -21,7 +21,9 @@ class TestRememberMultiplePhrasesWithNoise(MemoryChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json index 44fce24ea..8fca01b78 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ b/agbenchmark/challenges/retrieval/r1/r1_data.json @@ -7,7 +7,8 
@@ "answer": "£25.89", "should_contain": ["25.89"], "should_not_contain": [], - "files": [".txt"] + "files": [".txt"], + "type": "file" }, "mock": { "mock_func": "basic_retrieval_mock", diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index d107d9645..285b8affc 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -16,7 +16,9 @@ class TestRetrieval(RetrievalChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git a/agbenchmark/challenges/retrieval/r2/r2_data.json b/agbenchmark/challenges/retrieval/r2/r2_data.json index 925e6db83..3c388f192 100644 --- a/agbenchmark/challenges/retrieval/r2/r2_data.json +++ b/agbenchmark/challenges/retrieval/r2/r2_data.json @@ -7,7 +7,8 @@ "answer": "81,462", "should_contain": ["81,462"], "should_not_contain": [], - "files": [".txt"] + "files": [".txt"], + "type": "file" }, "mock": { "mock_func": "basic_retrieval_2_mock", diff --git a/agbenchmark/challenges/retrieval/r2/r2_test.py b/agbenchmark/challenges/retrieval/r2/r2_test.py index a60296ecd..ba727b8ed 100644 --- a/agbenchmark/challenges/retrieval/r2/r2_test.py +++ b/agbenchmark/challenges/retrieval/r2/r2_test.py @@ -16,7 +16,9 @@ class TestRetrieval2(RetrievalChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git a/agbenchmark/challenges/retrieval/r3/r3_data.json b/agbenchmark/challenges/retrieval/r3/r3_data.json index 183529c48..415456155 100644 --- a/agbenchmark/challenges/retrieval/r3/r3_data.json +++ b/agbenchmark/challenges/retrieval/r3/r3_data.json @@ -7,7 +7,8 @@ "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", "should_contain": ["15", "112", "117", "204", "413", "2,014", "3,198", "4,046", "7,000", "11,759", "21,461", "24,578", "31,536", "53,823", "81,462"], "should_not_contain": [], - "files": [".txt"] + "files": [".txt"], + "type": "file" }, "mock": { "mock_func": "basic_retrieval_3_mock", diff --git a/agbenchmark/challenges/retrieval/r3/r3_test.py b/agbenchmark/challenges/retrieval/r3/r3_test.py index bcd48d33c..b58f42672 100644 --- a/agbenchmark/challenges/retrieval/r3/r3_test.py +++ b/agbenchmark/challenges/retrieval/r3/r3_test.py @@ -16,7 +16,9 @@ class TestRetrieval3(RetrievalChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index 37ded0ae9..32149eb83 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ b/agbenchmark/mocks/tests/basic_mocks.py @@ -1,18 +1,6 @@ from agbenchmark.challenge 
import Challenge -def basic_read_file_mock(task: str, workspace: str) -> None: - """ - This mock reads a file and returns its content. - """ - - file_contents = Challenge.open_file(workspace, "file_to_check.txt") - - Challenge.write_to_file( - workspace, "file_to_check.txt", f"random string: {file_contents}" - ) - - def basic_write_file_mock(task: str, workspace: str) -> None: """ This mock writes to a file (creates one if it doesn't exist) diff --git a/agbenchmark/tests/basic_abilities/read_file/artifacts/file_to_check.txt b/agbenchmark/tests/basic_abilities/read_file/artifacts/file_to_check.txt deleted file mode 100644 index 980a0d5f1..000000000 --- a/agbenchmark/tests/basic_abilities/read_file/artifacts/file_to_check.txt +++ /dev/null @@ -1 +0,0 @@ -Hello World! diff --git a/agbenchmark/tests/basic_abilities/read_file/artifacts_in/file_to_check.txt b/agbenchmark/tests/basic_abilities/read_file/artifacts_in/file_to_check.txt new file mode 100644 index 000000000..980a0d5f1 --- /dev/null +++ b/agbenchmark/tests/basic_abilities/read_file/artifacts_in/file_to_check.txt @@ -0,0 +1 @@ +Hello World! diff --git a/agbenchmark/tests/basic_abilities/read_file/artifacts_out/file_to_check.txt b/agbenchmark/tests/basic_abilities/read_file/artifacts_out/file_to_check.txt new file mode 100644 index 000000000..c1a7879a1 --- /dev/null +++ b/agbenchmark/tests/basic_abilities/read_file/artifacts_out/file_to_check.txt @@ -0,0 +1 @@ +random string Hello World! diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json index a74b875a8..7463d22fc 100644 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json @@ -4,9 +4,10 @@ "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", "dependencies": ["basic_write_file"], "ground": { - "answer": "random string: Hello World!", - "should_contain": ["random string: Hello World!"], - "files": ["file_to_check.txt"] + "answer": "random string Hello World!", + "should_contain": ["random string", "Hello World!"], + "files": ["file_to_check.txt"], + "type": "file" }, "mock": { "mock_func": "basic_read_file_mock" diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index e7f2af9ec..7c38d2832 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -15,7 +15,9 @@ class TestReadFile(BasicChallenge): @pytest.mark.depends(on=["basic_write_file"], name="basic_read_file") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json index 358ebb538..9232a45a0 100644 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json @@ -7,7 +7,8 @@ "answer": "Washington", "should_contain": ["Washington"], "should_not_contain": ["New York", "Los Angeles", "San Francisco"], - "files": [".txt"] + "files": [".txt"], + "type": "file" }, "mock": { "mock_func": 
"basic_write_file_mock", diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 81f72cc9c..474d67127 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -16,7 +16,9 @@ class TestWriteFile(BasicChallenge): def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.open_files(config["workspace"], self.data.ground.files) + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) scores = [] for file_content in files_contents: diff --git a/pyproject.toml b/pyproject.toml index e0d579cab..33a8671cf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ markers = [ "retrieval", "regression", "basic", + "code", "memory" ] diff --git a/regression_tests.json b/regression_tests.json index 1195efbc9..3c8988a1b 100644 --- a/regression_tests.json +++ b/regression_tests.json @@ -1,9 +1,34 @@ { + "TestDebugSimpleTypoWithGuidance": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py" + }, "TestBasicMemory": { "difficulty": "basic", "dependencies": [], "test": "agbenchmark/challenges/memory/m1/m1_test.py" }, + "TestRememberMultipleIds": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py" + }, + "TestRememberMultipleIdsWithNoise": { + "difficulty": "medium", + "dependencies": [], + "test": "agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py" + }, + "TestRememberMultiplePhrasesWithNoise": { + "difficulty": "medium", + "dependencies": [], + "test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py" + }, + "TestRetrieval": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/retrieval/r1/r1_test.py" + }, "TestWriteFile": { "difficulty": "basic", "dependencies": [], @@ -19,31 +44,11 @@ "dependencies": [], "test": "agbenchmark/challenges/retrieval/r3/r3_test.py" }, - "TestRetrieval": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/retrieval/r1/r1_test.py" - }, "TestReadFile": { "difficulty": "basic", "dependencies": [ "basic_write_file" ], "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py" - }, - "TestRememberMultipleIds": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py" - }, - "TestRememberMultipleIdsWithNoise": { - "difficulty": "medium", - "dependencies": [], - "test": "agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py" - }, - "TestRememberMultiplePhrasesWithNoise": { - "difficulty": "medium", - "dependencies": [], - "test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py" } } \ No newline at end of file -- cgit v1.2.3 From 6ef32a9b1f83ee5d628bcbcc9199374b84230a23 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Fri, 7 Jul 2023 13:55:59 -0700 Subject: Add "Debug code without guidance" challenge (#66) Signed-off-by: Merwane Hamadi --- .../d1/debug_simple_typo_with_guidance_data.json | 2 +- .../challenges/code/d2/artifacts_in/__init__.py | 0 .../challenges/code/d2/artifacts_in/code.py | 13 +++++++++ .../challenges/code/d2/artifacts_in/test.py | 31 +++++++++++++++++++++ .../challenges/code/d2/artifacts_out/__init__.py 
| 0 .../challenges/code/d2/artifacts_out/code.py | 12 ++++++++ .../challenges/code/d2/artifacts_out/test.py | 31 +++++++++++++++++++++ agbenchmark/challenges/code/d2/d2_data.json | 22 +++++++++++++++ agbenchmark/challenges/code/d2/d2_test.py | 32 ++++++++++++++++++++++ agbenchmark/mocks/mock_manager.py | 4 +-- mypy.ini | 1 + regression_tests.json | 5 ++++ 12 files changed, 150 insertions(+), 3 deletions(-) create mode 100644 agbenchmark/challenges/code/d2/artifacts_in/__init__.py create mode 100644 agbenchmark/challenges/code/d2/artifacts_in/code.py create mode 100644 agbenchmark/challenges/code/d2/artifacts_in/test.py create mode 100644 agbenchmark/challenges/code/d2/artifacts_out/__init__.py create mode 100644 agbenchmark/challenges/code/d2/artifacts_out/code.py create mode 100644 agbenchmark/challenges/code/d2/artifacts_out/test.py create mode 100644 agbenchmark/challenges/code/d2/d2_data.json create mode 100644 agbenchmark/challenges/code/d2/d2_test.py diff --git a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json index ce9d92987..c29c3d83a 100644 --- a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json +++ b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json @@ -4,7 +4,7 @@ "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", "dependencies": [], "ground": { - "answer": "2314", + "answer": "[0, 1] [2, 5] [0, 3]", "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], "should_not_contain": [], "files": ["test.py"], diff --git a/agbenchmark/challenges/code/d2/artifacts_in/__init__.py b/agbenchmark/challenges/code/d2/artifacts_in/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/challenges/code/d2/artifacts_in/code.py b/agbenchmark/challenges/code/d2/artifacts_in/code.py new file mode 100644 index 000000000..df8120bfa --- /dev/null +++ b/agbenchmark/challenges/code/d2/artifacts_in/code.py @@ -0,0 +1,13 @@ +# mypy: ignore-errors +from typing import List, Optional + + +def two_sum(nums: List, target: int) -> Optional[List[int]]: + seen = {} + for i, num in enumerate(nums): + typo + complement = target - num + if complement in seen: + return [seen[complement], i] + seen[num] = i + return None diff --git a/agbenchmark/challenges/code/d2/artifacts_in/test.py b/agbenchmark/challenges/code/d2/artifacts_in/test.py new file mode 100644 index 000000000..d85d13537 --- /dev/null +++ b/agbenchmark/challenges/code/d2/artifacts_in/test.py @@ -0,0 +1,31 @@ +# mypy: ignore-errors +from code import two_sum +from typing import List + + +def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: + result = two_sum(nums, target) + print(result) + assert ( + result == expected_result + ), f"AssertionError: Expected the output to be {expected_result}" + + +if __name__ == "__main__": + # test the trivial case with the first two numbers + nums = [2, 7, 11, 15] + target = 9 + expected_result = [0, 1] + test_two_sum(nums, target, expected_result) + + # test for ability to use zero and the same number twice + nums = [2, 7, 0, 15, 12, 0] + target = 0 + expected_result = [2, 5] + test_two_sum(nums, target, expected_result) + + # test for first and last index usage and negative numbers + nums = [-6, 7, 11, 4] + target = -2 + expected_result = [0, 3] + test_two_sum(nums, target, expected_result) diff --git 
a/agbenchmark/challenges/code/d2/artifacts_out/__init__.py b/agbenchmark/challenges/code/d2/artifacts_out/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/challenges/code/d2/artifacts_out/code.py b/agbenchmark/challenges/code/d2/artifacts_out/code.py new file mode 100644 index 000000000..de3d8c62c --- /dev/null +++ b/agbenchmark/challenges/code/d2/artifacts_out/code.py @@ -0,0 +1,12 @@ +# mypy: ignore-errors +from typing import List, Optional + + +def two_sum(nums: List, target: int) -> Optional[List[int]]: + seen = {} + for i, num in enumerate(nums): + complement = target - num + if complement in seen: + return [seen[complement], i] + seen[num] = i + return None diff --git a/agbenchmark/challenges/code/d2/artifacts_out/test.py b/agbenchmark/challenges/code/d2/artifacts_out/test.py new file mode 100644 index 000000000..d85d13537 --- /dev/null +++ b/agbenchmark/challenges/code/d2/artifacts_out/test.py @@ -0,0 +1,31 @@ +# mypy: ignore-errors +from code import two_sum +from typing import List + + +def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: + result = two_sum(nums, target) + print(result) + assert ( + result == expected_result + ), f"AssertionError: Expected the output to be {expected_result}" + + +if __name__ == "__main__": + # test the trivial case with the first two numbers + nums = [2, 7, 11, 15] + target = 9 + expected_result = [0, 1] + test_two_sum(nums, target, expected_result) + + # test for ability to use zero and the same number twice + nums = [2, 7, 0, 15, 12, 0] + target = 0 + expected_result = [2, 5] + test_two_sum(nums, target, expected_result) + + # test for first and last index usage and negative numbers + nums = [-6, 7, 11, 4] + target = -2 + expected_result = [0, 3] + test_two_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d2/d2_data.json b/agbenchmark/challenges/code/d2/d2_data.json new file mode 100644 index 000000000..6003055a8 --- /dev/null +++ b/agbenchmark/challenges/code/d2/d2_data.json @@ -0,0 +1,22 @@ +{ + "name": "debug_simple_typo_without_guidance", + "category": ["code"], + "task": "Make test.py run without errors.", + "dependencies": [], + "ground": { + "answer": "[0, 1] [2, 5] [0, 3]", + "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], + "should_not_contain": [], + "files": ["test.py"], + "type": "execute_python_code" + }, + "mock": { + "mock_func": null, + "mock_task": null + }, + "info": { + "difficulty": "basic", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/code/d2/d2_test.py b/agbenchmark/challenges/code/d2/d2_test.py new file mode 100644 index 000000000..d49f9dfe9 --- /dev/null +++ b/agbenchmark/challenges/code/d2/d2_test.py @@ -0,0 +1,32 @@ +import os +from typing import Any, Dict + +import pytest + +from agbenchmark.challenges.code.code import CodeChallenge + + +class TestDebugSimpleTypoWithoutGuidance(CodeChallenge): + """The first memory challenge""" + + def get_file_path(self) -> str: # all tests must implement this method + return os.path.join(os.path.dirname(__file__), "d2_data.json") + + @pytest.mark.depends( + name="test_debug_simple_typo_without_guidance", + depends=["test_debug_simple_typo_with_guidance"], + ) + def test_method(self, config: Dict[str, Any]) -> None: + self.setup_challenge(config) + + files_contents = self.get_artifacts_out( + 
config["workspace"], self.data.ground.files + ) + + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, self.data.ground) + print("Your score is:", score) + scores.append(score) + + assert 1 in scores diff --git a/agbenchmark/mocks/mock_manager.py b/agbenchmark/mocks/mock_manager.py index 5b84965c3..57c03405d 100644 --- a/agbenchmark/mocks/mock_manager.py +++ b/agbenchmark/mocks/mock_manager.py @@ -1,11 +1,11 @@ -from typing import Any, Dict +from typing import Any, Dict, Optional import agbenchmark.mocks.tests.basic_mocks as basic_mocks import agbenchmark.mocks.tests.retrieval_mocks as retrieval_mocks class MockManager: - def __init__(self, task: str, config: Dict[str, Any]) -> None: + def __init__(self, task: Optional[str], config: Dict[str, Any]) -> None: self.task = task self.workspace = config["workspace"] self.modules = [basic_mocks, retrieval_mocks] diff --git a/mypy.ini b/mypy.ini index ceb13fcd2..764c239f1 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,4 +1,5 @@ [mypy] +namespace_packages = True follow_imports = skip check_untyped_defs = True disallow_untyped_defs = True diff --git a/regression_tests.json b/regression_tests.json index 3c8988a1b..59a9694bf 100644 --- a/regression_tests.json +++ b/regression_tests.json @@ -50,5 +50,10 @@ "basic_write_file" ], "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py" + }, + "TestDebugSimpleTypoWithoutGuidance": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/code/d2/d2_test.py" } } \ No newline at end of file -- cgit v1.2.3 From e61523e59ed1a5582ce4a81699faef5bc36bcd16 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Fri, 7 Jul 2023 13:58:17 -0700 Subject: Get rid of get file path by using the data.json convention to store the challenge information (#67) Signed-off-by: Merwane Hamadi --- agbenchmark/README.md | 3 --- agbenchmark/challenge.py | 16 ++++--------- agbenchmark/challenges/code/d1/data.json | 22 ++++++++++++++++++ .../d1/debug_simple_typo_with_guidance_data.json | 22 ------------------ .../d1/debug_simple_typo_with_guidance_test.py | 6 ----- agbenchmark/challenges/code/d2/d2_data.json | 22 ------------------ agbenchmark/challenges/code/d2/d2_test.py | 4 ---- agbenchmark/challenges/code/d2/data.json | 22 ++++++++++++++++++ agbenchmark/challenges/memory/m1/data.json | 22 ++++++++++++++++++ agbenchmark/challenges/memory/m1/m1_data.json | 22 ------------------ agbenchmark/challenges/memory/m1/m1_test.py | 4 ---- agbenchmark/challenges/memory/m2/data.json | 22 ++++++++++++++++++ .../memory/m2/remember_multiple_ids_data.json | 22 ------------------ .../memory/m2/remember_multiple_ids_test.py | 6 ----- agbenchmark/challenges/memory/m3/data.json | 22 ++++++++++++++++++ .../m3/remember_multiple_ids_with_noise_data.json | 22 ------------------ .../m3/remember_multiple_ids_with_noise_test.py | 6 ----- agbenchmark/challenges/memory/m4/data.json | 27 ++++++++++++++++++++++ .../remember_multiple_phrases_with_noise_data.json | 27 ---------------------- .../remember_multiple_phrases_with_noise_test.py | 6 ----- agbenchmark/challenges/retrieval/r1/data.json | 22 ++++++++++++++++++ agbenchmark/challenges/retrieval/r1/r1_data.json | 22 ------------------ agbenchmark/challenges/retrieval/r1/r1_test.py | 4 ---- agbenchmark/challenges/retrieval/r2/data.json | 22 ++++++++++++++++++ agbenchmark/challenges/retrieval/r2/r2_data.json | 22 ------------------ agbenchmark/challenges/retrieval/r2/r2_test.py | 4 ---- agbenchmark/challenges/retrieval/r3/data.json | 22 
++++++++++++++++++ agbenchmark/challenges/retrieval/r3/r3_data.json | 22 ------------------ agbenchmark/challenges/retrieval/r3/r3_test.py | 4 ---- .../tests/basic_abilities/read_file/data.json | 20 ++++++++++++++++ .../basic_abilities/read_file/r_file_data.json | 20 ---------------- .../basic_abilities/read_file/read_file_test.py | 4 ---- .../tests/basic_abilities/write_file/data.json | 22 ++++++++++++++++++ .../basic_abilities/write_file/w_file_data.json | 22 ------------------ .../basic_abilities/write_file/write_file_test.py | 4 ---- 35 files changed, 249 insertions(+), 312 deletions(-) create mode 100644 agbenchmark/challenges/code/d1/data.json delete mode 100644 agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json delete mode 100644 agbenchmark/challenges/code/d2/d2_data.json create mode 100644 agbenchmark/challenges/code/d2/data.json create mode 100644 agbenchmark/challenges/memory/m1/data.json delete mode 100644 agbenchmark/challenges/memory/m1/m1_data.json create mode 100644 agbenchmark/challenges/memory/m2/data.json delete mode 100644 agbenchmark/challenges/memory/m2/remember_multiple_ids_data.json create mode 100644 agbenchmark/challenges/memory/m3/data.json delete mode 100644 agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_data.json create mode 100644 agbenchmark/challenges/memory/m4/data.json delete mode 100644 agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json create mode 100644 agbenchmark/challenges/retrieval/r1/data.json delete mode 100644 agbenchmark/challenges/retrieval/r1/r1_data.json create mode 100644 agbenchmark/challenges/retrieval/r2/data.json delete mode 100644 agbenchmark/challenges/retrieval/r2/r2_data.json create mode 100644 agbenchmark/challenges/retrieval/r3/data.json delete mode 100644 agbenchmark/challenges/retrieval/r3/r3_data.json create mode 100644 agbenchmark/tests/basic_abilities/read_file/data.json delete mode 100644 agbenchmark/tests/basic_abilities/read_file/r_file_data.json create mode 100644 agbenchmark/tests/basic_abilities/write_file/data.json delete mode 100644 agbenchmark/tests/basic_abilities/write_file/w_file_data.json diff --git a/agbenchmark/README.md b/agbenchmark/README.md index a478f83f3..01f602dc6 100644 --- a/agbenchmark/README.md +++ b/agbenchmark/README.md @@ -53,9 +53,6 @@ import os class TestWriteFile(BasicChallenge): """Testing if LLM can write to a file""" - def get_file_path(self) -> str: # all tests must implement this method - return os.path.join(os.path.dirname(__file__), "w_file_data.json") - @pytest.mark.depends(on=[], name="basic_write_file") def test_method(self, workspace): # implement scoring logic by looking at workspace diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py index 4c8e69848..29bc3ff91 100644 --- a/agbenchmark/challenge.py +++ b/agbenchmark/challenge.py @@ -3,7 +3,7 @@ import inspect import os import subprocess import types -from abc import ABC, ABCMeta, abstractmethod +from abc import ABC, ABCMeta from typing import Any, Dict, List, Optional, Tuple, Type, cast import pytest @@ -35,20 +35,12 @@ class Challenge(ABC, metaclass=ChallengeMeta): Defines helper methods for running a challenge""" _data_cache: Dict[str, ChallengeData] = {} - - @abstractmethod - def get_file_path(self) -> str: - """This should be implemented by any class which inherits from BasicChallenge""" - pass + CHALLENGE_LOCATION: str @property def data(self) -> ChallengeData: - "Check if the data is already loaded, if not load it" - file_path = ( - 
self.get_file_path() - ) # file_path serves as the key in the cache dictionary - if file_path not in Challenge._data_cache: - Challenge._data_cache[file_path] = ChallengeData.deserialize(file_path) + file_path = f"{self.CHALLENGE_LOCATION}/data.json" + Challenge._data_cache[file_path] = ChallengeData.deserialize(file_path) return Challenge._data_cache[file_path] @property diff --git a/agbenchmark/challenges/code/d1/data.json b/agbenchmark/challenges/code/d1/data.json new file mode 100644 index 000000000..c29c3d83a --- /dev/null +++ b/agbenchmark/challenges/code/d1/data.json @@ -0,0 +1,22 @@ +{ + "name": "debug_simple_typo_with_guidance", + "category": ["code"], + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "dependencies": [], + "ground": { + "answer": "[0, 1] [2, 5] [0, 3]", + "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], + "should_not_contain": [], + "files": ["test.py"], + "type": "execute_python_code" + }, + "mock": { + "mock_func": null, + "mock_task": null + }, + "info": { + "difficulty": "basic", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json deleted file mode 100644 index c29c3d83a..000000000 --- a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_data.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "name": "debug_simple_typo_with_guidance", - "category": ["code"], - "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", - "dependencies": [], - "ground": { - "answer": "[0, 1] [2, 5] [0, 3]", - "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], - "should_not_contain": [], - "files": ["test.py"], - "type": "execute_python_code" - }, - "mock": { - "mock_func": null, - "mock_task": null - }, - "info": { - "difficulty": "basic", - "description": "Tests ability for the agent to debug python code with a simple typo in it.", - "side_effects": ["tests if there is in fact an LLM attached"] - } -} diff --git a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py index e5f50c700..16a12ae41 100644 --- a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py +++ b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py @@ -1,4 +1,3 @@ -import os from typing import Any, Dict import pytest @@ -9,11 +8,6 @@ from agbenchmark.challenges.code.code import CodeChallenge class TestDebugSimpleTypoWithGuidance(CodeChallenge): """The first memory challenge""" - def get_file_path(self) -> str: # all tests must implement this method - return os.path.join( - os.path.dirname(__file__), "debug_simple_typo_with_guidance_data.json" - ) - @pytest.mark.depends(name="test_debug_simple_typo_with_guidance") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) diff --git a/agbenchmark/challenges/code/d2/d2_data.json b/agbenchmark/challenges/code/d2/d2_data.json deleted file mode 100644 index 6003055a8..000000000 --- a/agbenchmark/challenges/code/d2/d2_data.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "name": "debug_simple_typo_without_guidance", - "category": ["code"], - "task": "Make test.py run without errors.", - "dependencies": [], - 
"ground": { - "answer": "[0, 1] [2, 5] [0, 3]", - "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], - "should_not_contain": [], - "files": ["test.py"], - "type": "execute_python_code" - }, - "mock": { - "mock_func": null, - "mock_task": null - }, - "info": { - "difficulty": "basic", - "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", - "side_effects": ["tests if there is in fact an LLM attached"] - } -} diff --git a/agbenchmark/challenges/code/d2/d2_test.py b/agbenchmark/challenges/code/d2/d2_test.py index d49f9dfe9..7a5988b94 100644 --- a/agbenchmark/challenges/code/d2/d2_test.py +++ b/agbenchmark/challenges/code/d2/d2_test.py @@ -1,4 +1,3 @@ -import os from typing import Any, Dict import pytest @@ -9,9 +8,6 @@ from agbenchmark.challenges.code.code import CodeChallenge class TestDebugSimpleTypoWithoutGuidance(CodeChallenge): """The first memory challenge""" - def get_file_path(self) -> str: # all tests must implement this method - return os.path.join(os.path.dirname(__file__), "d2_data.json") - @pytest.mark.depends( name="test_debug_simple_typo_without_guidance", depends=["test_debug_simple_typo_with_guidance"], diff --git a/agbenchmark/challenges/code/d2/data.json b/agbenchmark/challenges/code/d2/data.json new file mode 100644 index 000000000..6003055a8 --- /dev/null +++ b/agbenchmark/challenges/code/d2/data.json @@ -0,0 +1,22 @@ +{ + "name": "debug_simple_typo_without_guidance", + "category": ["code"], + "task": "Make test.py run without errors.", + "dependencies": [], + "ground": { + "answer": "[0, 1] [2, 5] [0, 3]", + "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], + "should_not_contain": [], + "files": ["test.py"], + "type": "execute_python_code" + }, + "mock": { + "mock_func": null, + "mock_task": null + }, + "info": { + "difficulty": "basic", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/memory/m1/data.json b/agbenchmark/challenges/memory/m1/data.json new file mode 100644 index 000000000..3e410ac53 --- /dev/null +++ b/agbenchmark/challenges/memory/m1/data.json @@ -0,0 +1,22 @@ +{ + "name": "basic_memory", + "category": ["memory"], + "task": "Follow the instructions in the instructions_1.txt file", + "dependencies": [], + "ground": { + "answer": "2314", + "should_contain": ["2314"], + "should_not_contain": [], + "files": ["file_to_check.txt"], + "type": "file" + }, + "mock": { + "mock_func": "basic_memory_mock", + "mock_task": "Follow the instructions in the instructions_1.txt file" + }, + "info": { + "difficulty": "basic", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/memory/m1/m1_data.json b/agbenchmark/challenges/memory/m1/m1_data.json deleted file mode 100644 index 3e410ac53..000000000 --- a/agbenchmark/challenges/memory/m1/m1_data.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "name": "basic_memory", - "category": ["memory"], - "task": "Follow the instructions in the instructions_1.txt file", - "dependencies": [], - "ground": { - "answer": "2314", - "should_contain": ["2314"], - "should_not_contain": [], - "files": ["file_to_check.txt"], - "type": "file" - }, - "mock": { - "mock_func": "basic_memory_mock", - "mock_task": "Follow the instructions in the instructions_1.txt file" - }, - "info": { - "difficulty": "basic", - "description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files", - "side_effects": ["tests if there is in fact an LLM attached"] - } -} diff --git a/agbenchmark/challenges/memory/m1/m1_test.py b/agbenchmark/challenges/memory/m1/m1_test.py index c1f370244..9e5e0a775 100644 --- a/agbenchmark/challenges/memory/m1/m1_test.py +++ b/agbenchmark/challenges/memory/m1/m1_test.py @@ -1,4 +1,3 @@ -import os from typing import Any, Dict import pytest @@ -9,9 +8,6 @@ from agbenchmark.challenges.memory.memory import MemoryChallenge class TestBasicMemory(MemoryChallenge): """The first memory challenge""" - def get_file_path(self) -> str: # all tests must implement this method - return os.path.join(os.path.dirname(__file__), "m1_data.json") - @pytest.mark.depends(name="test_basic_memory") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) diff --git a/agbenchmark/challenges/memory/m2/data.json b/agbenchmark/challenges/memory/m2/data.json new file mode 100644 index 000000000..29d7339b8 --- /dev/null +++ b/agbenchmark/challenges/memory/m2/data.json @@ -0,0 +1,22 @@ +{ + "name": "remember_multiple_ids", + "category": ["memory"], + "task": "Follow the instructions in the instructions_1.txt file", + "dependencies": [], + "ground": { + "answer": "3145\n3791\n9317\n9471", + "should_contain": ["3145", "3791", "9317", "9471"], + "should_not_contain": [], + "files": ["file_to_check.txt"], + "type": "file" + }, + "mock": { + "mock_func": "remember_multiple_ids_mock", + "mock_task": "Follow the instructions in the instructions_1.txt file" + }, + "info": { + "difficulty": "basic", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/memory/m2/remember_multiple_ids_data.json b/agbenchmark/challenges/memory/m2/remember_multiple_ids_data.json deleted file mode 100644 index 29d7339b8..000000000 --- a/agbenchmark/challenges/memory/m2/remember_multiple_ids_data.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "name": "remember_multiple_ids", - "category": ["memory"], - "task": "Follow the instructions in the instructions_1.txt file", - "dependencies": [], - "ground": { - "answer": "3145\n3791\n9317\n9471", - "should_contain": ["3145", "3791", "9317", "9471"], - "should_not_contain": [], - "files": ["file_to_check.txt"], - "type": "file" - }, - "mock": { - "mock_func": "remember_multiple_ids_mock", - "mock_task": "Follow the instructions in the instructions_1.txt file" - }, - "info": { - "difficulty": "basic", - "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", - "side_effects": ["tests if there is in fact an LLM attached"] - } -} diff --git a/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py b/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py index f0f2b3971..6ba38dad3 100644 --- a/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py +++ b/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py @@ -1,4 +1,3 @@ -import os from typing import Any, Dict import pytest @@ -9,11 +8,6 @@ from agbenchmark.challenges.memory.memory import MemoryChallenge class TestRememberMultipleIds(MemoryChallenge): """The first memory challenge""" - def get_file_path(self) -> str: # all tests must implement this method - return os.path.join( - os.path.dirname(__file__), "remember_multiple_ids_data.json" - ) - @pytest.mark.depends( name="test_remember_multiple_ids", depends=["test_basic_memory"] ) diff --git a/agbenchmark/challenges/memory/m3/data.json b/agbenchmark/challenges/memory/m3/data.json new file mode 100644 index 000000000..6b53c4571 --- /dev/null +++ b/agbenchmark/challenges/memory/m3/data.json @@ -0,0 +1,22 @@ +{ + "name": "remember_multiple_ids_with_noise_mock", + "category": ["memory"], + "task": "Follow the instructions in the instructions_1.txt file", + "dependencies": [], + "ground": { + "answer": "3145\n3791\n9317\n9471", + "should_contain": ["3145", "3791", "9317", "9471"], + "should_not_contain": [], + "files": ["file_to_check.txt"], + "type": "file" + }, + "mock": { + "mock_func": "remember_multiple_ids_mock", + "mock_task": "Follow the instructions in the instructions_1.txt file" + }, + "info": { + "difficulty": "medium", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_data.json b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_data.json deleted file mode 100644 index 6b53c4571..000000000 --- a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_data.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "name": "remember_multiple_ids_with_noise_mock", - "category": ["memory"], - "task": "Follow the instructions in the instructions_1.txt file", - "dependencies": [], - "ground": { - "answer": "3145\n3791\n9317\n9471", - "should_contain": ["3145", "3791", "9317", "9471"], - "should_not_contain": [], - "files": ["file_to_check.txt"], - "type": "file" - }, - "mock": { - "mock_func": "remember_multiple_ids_mock", - "mock_task": "Follow the instructions in the instructions_1.txt file" - }, - "info": { - "difficulty": "medium", - "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", - "side_effects": ["tests if there is in fact an LLM attached"] - } -} diff --git a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py index 493ea3574..037a6929e 100644 --- a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py +++ b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py @@ -1,4 +1,3 @@ -import os from typing import Any, Dict import pytest @@ -9,11 +8,6 @@ from agbenchmark.challenges.memory.memory import MemoryChallenge class TestRememberMultipleIdsWithNoise(MemoryChallenge): """The first memory challenge""" - def get_file_path(self) -> str: # all tests must implement this method - return os.path.join( - os.path.dirname(__file__), "remember_multiple_ids_with_noise_data.json" - ) - @pytest.mark.depends( name="test_remember_multiple_ids_with_noise", depends=["test_remember_multiple_ids"], diff --git a/agbenchmark/challenges/memory/m4/data.json b/agbenchmark/challenges/memory/m4/data.json new file mode 100644 index 000000000..316ef9476 --- /dev/null +++ b/agbenchmark/challenges/memory/m4/data.json @@ -0,0 +1,27 @@ +{ + "name": "remember_multiple_phrases_with_noise_mock", + "category": ["memory"], + "task": "Follow the instructions in the instructions_1.txt file", + "dependencies": [], + "ground": { + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyoncé on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "should_contain": [ + "The purple elephant danced on a rainbow while eating a taco", + "The sneaky toaster stole my socks and ran away to Hawaii", + "My pet rock sings better than Beyoncé on Tuesdays", + "The giant hamster rode a unicycle through the crowded mall" + ], + "should_not_contain": [], + "files": ["file_to_check.txt"], + "type": "file" + }, + "mock": { + "mock_func": "remember_multiple_phrases_with_noise_mock", + "mock_task": "Follow the instructions in the instructions_1.txt file" + }, + "info": { + "difficulty": "medium", + "description": "Tests ability for the agent to remember information 
between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json deleted file mode 100644 index 316ef9476..000000000 --- a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_data.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "name": "remember_multiple_phrases_with_noise_mock", - "category": ["memory"], - "task": "Follow the instructions in the instructions_1.txt file", - "dependencies": [], - "ground": { - "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyoncé on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", - "should_contain": [ - "The purple elephant danced on a rainbow while eating a taco", - "The sneaky toaster stole my socks and ran away to Hawaii", - "My pet rock sings better than Beyoncé on Tuesdays", - "The giant hamster rode a unicycle through the crowded mall" - ], - "should_not_contain": [], - "files": ["file_to_check.txt"], - "type": "file" - }, - "mock": { - "mock_func": "remember_multiple_phrases_with_noise_mock", - "mock_task": "Follow the instructions in the instructions_1.txt file" - }, - "info": { - "difficulty": "medium", - "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", - "side_effects": ["tests if there is in fact an LLM attached"] - } -} diff --git a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py index e37e9a385..2c931af8c 100644 --- a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py +++ b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py @@ -1,4 +1,3 @@ -import os from typing import Any, Dict import pytest @@ -9,11 +8,6 @@ from agbenchmark.challenges.memory.memory import MemoryChallenge class TestRememberMultiplePhrasesWithNoise(MemoryChallenge): """The first memory challenge""" - def get_file_path(self) -> str: # all tests must implement this method - return os.path.join( - os.path.dirname(__file__), "remember_multiple_phrases_with_noise_data.json" - ) - @pytest.mark.depends( name="test_remember_multiple_phrases_with_noise", depends=["test_remember_multiple_ids_with_noise"], diff --git a/agbenchmark/challenges/retrieval/r1/data.json b/agbenchmark/challenges/retrieval/r1/data.json new file mode 100644 index 000000000..8fca01b78 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r1/data.json @@ -0,0 +1,22 @@ +{ + "name": "basic_information_retrieval", + "category": ["retrieval"], + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "dependencies": [], + "ground": { + "answer": "£25.89", + "should_contain": ["25.89"], + "should_not_contain": [], + "files": [".txt"], + "type": "file" + }, + "mock": { + "mock_func": "basic_retrieval_mock", + "mock_task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file." + }, + "info": { + "difficulty": "basic", + "description": "Tests ability to retrieve information from a website.", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/retrieval/r1/r1_data.json b/agbenchmark/challenges/retrieval/r1/r1_data.json deleted file mode 100644 index 8fca01b78..000000000 --- a/agbenchmark/challenges/retrieval/r1/r1_data.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "name": "basic_information_retrieval", - "category": ["retrieval"], - "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", - "dependencies": [], - "ground": { - "answer": "£25.89", - "should_contain": ["25.89"], - "should_not_contain": [], - "files": [".txt"], - "type": "file" - }, - "mock": { - "mock_func": "basic_retrieval_mock", - "mock_task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file." 
- }, - "info": { - "difficulty": "basic", - "description": "Tests ability to retrieve information from a website.", - "side_effects": ["tests if there is in fact an LLM attached"] - } -} diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 285b8affc..68d3de4e3 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -1,4 +1,3 @@ -import os from typing import Any, Dict import pytest @@ -9,9 +8,6 @@ from agbenchmark.challenges.retrieval.retrieval import RetrievalChallenge class TestRetrieval(RetrievalChallenge): """The first information-retrieval challenge""" - def get_file_path(self) -> str: # all tests must implement this method - return os.path.join(os.path.dirname(__file__), "r1_data.json") - @pytest.mark.depends(name="test_retrieval") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) diff --git a/agbenchmark/challenges/retrieval/r2/data.json b/agbenchmark/challenges/retrieval/r2/data.json new file mode 100644 index 000000000..3c388f192 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r2/data.json @@ -0,0 +1,22 @@ +{ + "name": "basic_information_retrieval", + "category": ["retrieval"], + "task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "dependencies": [], + "ground": { + "answer": "81,462", + "should_contain": ["81,462"], + "should_not_contain": [], + "files": [".txt"], + "type": "file" + }, + "mock": { + "mock_func": "basic_retrieval_2_mock", + "mock_task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)." + }, + "info": { + "difficulty": "basic", + "description": "Tests ability to retrieve information.", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/retrieval/r2/r2_data.json b/agbenchmark/challenges/retrieval/r2/r2_data.json deleted file mode 100644 index 3c388f192..000000000 --- a/agbenchmark/challenges/retrieval/r2/r2_data.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "name": "basic_information_retrieval", - "category": ["retrieval"], - "task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", - "dependencies": [], - "ground": { - "answer": "81,462", - "should_contain": ["81,462"], - "should_not_contain": [], - "files": [".txt"], - "type": "file" - }, - "mock": { - "mock_func": "basic_retrieval_2_mock", - "mock_task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)." 
- }, - "info": { - "difficulty": "basic", - "description": "Tests ability to retrieve information.", - "side_effects": ["tests if there is in fact an LLM attached"] - } -} diff --git a/agbenchmark/challenges/retrieval/r2/r2_test.py b/agbenchmark/challenges/retrieval/r2/r2_test.py index ba727b8ed..5a1a20690 100644 --- a/agbenchmark/challenges/retrieval/r2/r2_test.py +++ b/agbenchmark/challenges/retrieval/r2/r2_test.py @@ -1,4 +1,3 @@ -import os from typing import Any, Dict import pytest @@ -9,9 +8,6 @@ from agbenchmark.challenges.retrieval.retrieval import RetrievalChallenge class TestRetrieval2(RetrievalChallenge): """The first information-retrieval challenge""" - def get_file_path(self) -> str: # all tests must implement this method - return os.path.join(os.path.dirname(__file__), "r2_data.json") - @pytest.mark.depends(on=["test_retrieval"], name="test_retrieval_2") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) diff --git a/agbenchmark/challenges/retrieval/r3/data.json b/agbenchmark/challenges/retrieval/r3/data.json new file mode 100644 index 000000000..415456155 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r3/data.json @@ -0,0 +1,22 @@ +{ + "name": "basic_information_retrieval", + "category": ["retrieval"], + "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "dependencies": [], + "ground": { + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "should_contain": ["15", "112", "117", "204", "413", "2,014", "3,198", "4,046", "7,000", "11,759", "21,461", "24,578", "31,536", "53,823", "81,462"], + "should_not_contain": [], + "files": [".txt"], + "type": "file" + }, + "mock": { + "mock_func": "basic_retrieval_3_mock", + "mock_task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)." + }, + "info": { + "difficulty": "basic", + "description": "Tests ability to retrieve information.", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/retrieval/r3/r3_data.json b/agbenchmark/challenges/retrieval/r3/r3_data.json deleted file mode 100644 index 415456155..000000000 --- a/agbenchmark/challenges/retrieval/r3/r3_data.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "name": "basic_information_retrieval", - "category": ["retrieval"], - "task": "Write tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", - "dependencies": [], - "ground": { - "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", - "should_contain": ["15", "112", "117", "204", "413", "2,014", "3,198", "4,046", "7,000", "11,759", "21,461", "24,578", "31,536", "53,823", "81,462"], - "should_not_contain": [], - "files": [".txt"], - "type": "file" - }, - "mock": { - "mock_func": "basic_retrieval_3_mock", - "mock_task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)." - }, - "info": { - "difficulty": "basic", - "description": "Tests ability to retrieve information.", - "side_effects": ["tests if there is in fact an LLM attached"] - } -} diff --git a/agbenchmark/challenges/retrieval/r3/r3_test.py b/agbenchmark/challenges/retrieval/r3/r3_test.py index b58f42672..c4b4bcf12 100644 --- a/agbenchmark/challenges/retrieval/r3/r3_test.py +++ b/agbenchmark/challenges/retrieval/r3/r3_test.py @@ -1,4 +1,3 @@ -import os from typing import Any, Dict import pytest @@ -9,9 +8,6 @@ from agbenchmark.challenges.retrieval.retrieval import RetrievalChallenge class TestRetrieval3(RetrievalChallenge): """The first information-retrieval challenge""" - def get_file_path(self) -> str: # all tests must implement this method - return os.path.join(os.path.dirname(__file__), "r3_data.json") - @pytest.mark.depends(on=["test_retrieval_2"], name="test_retrieval_3") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) diff --git a/agbenchmark/tests/basic_abilities/read_file/data.json b/agbenchmark/tests/basic_abilities/read_file/data.json new file mode 100644 index 000000000..7463d22fc --- /dev/null +++ b/agbenchmark/tests/basic_abilities/read_file/data.json @@ -0,0 +1,20 @@ +{ + "name": "basic_read_file", + "category": ["basic"], + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "dependencies": ["basic_write_file"], + "ground": { + "answer": "random string Hello World!", + "should_contain": ["random string", "Hello World!"], + "files": ["file_to_check.txt"], + "type": "file" + }, + "mock": { + "mock_func": "basic_read_file_mock" + }, + "info": { + "description": "This reads the file quickly", + "difficulty": "basic", + "side_effects": [""] + } +} diff --git a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json b/agbenchmark/tests/basic_abilities/read_file/r_file_data.json deleted file mode 100644 index 7463d22fc..000000000 --- a/agbenchmark/tests/basic_abilities/read_file/r_file_data.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "name": "basic_read_file", - "category": ["basic"], - "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", - "dependencies": ["basic_write_file"], - "ground": { - "answer": "random string Hello World!", - "should_contain": ["random string", "Hello World!"], - "files": ["file_to_check.txt"], - "type": "file" - }, - "mock": { - "mock_func": "basic_read_file_mock" - }, - "info": { - "description": "This reads the file quickly", - "difficulty": "basic", - "side_effects": [""] - } -} diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py 
b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py index 7c38d2832..cf5dceb69 100644 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py @@ -1,4 +1,3 @@ -import os from typing import Any, Dict import pytest @@ -9,9 +8,6 @@ from agbenchmark.tests.basic_abilities.basic_challenge import BasicChallenge class TestReadFile(BasicChallenge): """Testing if LLM can read a file""" - def get_file_path(self) -> str: # all tests must implement this method - return os.path.join(os.path.dirname(__file__), "r_file_data.json") - @pytest.mark.depends(on=["basic_write_file"], name="basic_read_file") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) diff --git a/agbenchmark/tests/basic_abilities/write_file/data.json b/agbenchmark/tests/basic_abilities/write_file/data.json new file mode 100644 index 000000000..9232a45a0 --- /dev/null +++ b/agbenchmark/tests/basic_abilities/write_file/data.json @@ -0,0 +1,22 @@ +{ + "name": "basic_write_file", + "category": ["basic"], + "task": "Print the the capital of America to a .txt file", + "dependencies": [], + "ground": { + "answer": "Washington", + "should_contain": ["Washington"], + "should_not_contain": ["New York", "Los Angeles", "San Francisco"], + "files": [".txt"], + "type": "file" + }, + "mock": { + "mock_func": "basic_write_file_mock", + "mock_task": "What is the capital of America?" + }, + "info": { + "difficulty": "basic", + "description": "Tests the writing to file", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json b/agbenchmark/tests/basic_abilities/write_file/w_file_data.json deleted file mode 100644 index 9232a45a0..000000000 --- a/agbenchmark/tests/basic_abilities/write_file/w_file_data.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "name": "basic_write_file", - "category": ["basic"], - "task": "Print the the capital of America to a .txt file", - "dependencies": [], - "ground": { - "answer": "Washington", - "should_contain": ["Washington"], - "should_not_contain": ["New York", "Los Angeles", "San Francisco"], - "files": [".txt"], - "type": "file" - }, - "mock": { - "mock_func": "basic_write_file_mock", - "mock_task": "What is the capital of America?" 
- }, - "info": { - "difficulty": "basic", - "description": "Tests the writing to file", - "side_effects": ["tests if there is in fact an LLM attached"] - } -} diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py index 474d67127..ba0395186 100644 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py @@ -1,4 +1,3 @@ -import os from typing import Any, Dict import pytest @@ -9,9 +8,6 @@ from agbenchmark.tests.basic_abilities.basic_challenge import BasicChallenge class TestWriteFile(BasicChallenge): """Testing if LLM can write to a file""" - def get_file_path(self) -> str: # all tests must implement this method - return os.path.join(os.path.dirname(__file__), "w_file_data.json") - @pytest.mark.depends(name="basic_write_file") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) -- cgit v1.2.3 From 4562bc6caf5008d65ccd7f0cb38df0521039cada Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Fri, 7 Jul 2023 17:54:09 -0400 Subject: Update data.json remove text --- agbenchmark/challenges/memory/m1/data.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agbenchmark/challenges/memory/m1/data.json b/agbenchmark/challenges/memory/m1/data.json index 3e410ac53..c7d441903 100644 --- a/agbenchmark/challenges/memory/m1/data.json +++ b/agbenchmark/challenges/memory/m1/data.json @@ -7,7 +7,7 @@ "answer": "2314", "should_contain": ["2314"], "should_not_contain": [], - "files": ["file_to_check.txt"], + "files": [".txt"], "type": "file" }, "mock": { -- cgit v1.2.3 From 3defe044bd55b050e5d00b3fe9a3f4ce3dca6555 Mon Sep 17 00:00:00 2001 From: Erik Peterson Date: Fri, 7 Jul 2023 15:02:08 -0700 Subject: Print out all of stdout on each process poll. (#69) --- agbenchmark/agent_interface.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index 05540f6d3..77eb110bf 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -61,8 +61,9 @@ def run_agent( while True: if process.stdout is None: continue - output = process.stdout.readline() - print(output.strip()) + + while output := process.stdout.readline(): + print(output.strip()) # Check if process has ended if process.poll() is not None: -- cgit v1.2.3 From e34c83ca1c516e84a75d81b775723cd6346776ee Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Fri, 7 Jul 2023 15:34:57 -0700 Subject: Add .txt to memory challenges (#70) --- agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt | 2 +- agbenchmark/challenges/memory/m2/data.json | 2 +- agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt | 2 +- agbenchmark/challenges/memory/m3/data.json | 2 +- agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt | 2 +- agbenchmark/challenges/memory/m4/data.json | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt index 92a6fba94..2f60afa17 100644 --- a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt +++ b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt @@ -1 +1 @@ -Write the ids previously mentioned to a file called file_to_check.txt. +Write the ids previously mentioned to a .txt file. 
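Note on the change above: replacing the literal `file_to_check.txt` in `ground.files` with the bare suffix `.txt` means the scoring step has to resolve a pattern against the workspace rather than open one fixed path. A minimal sketch of that resolution, assuming suffix patterns are globbed — the helper name `resolve_ground_files` is illustrative and not part of the repository:

    import glob
    import os
    from typing import List

    def resolve_ground_files(workspace: str, file_patterns: List[str]) -> List[str]:
        # Hypothetical helper: a leading-dot pattern such as ".txt" is treated
        # as a suffix glob ("*.txt"); anything else is taken as a literal name.
        contents: List[str] = []
        for pattern in file_patterns:
            if pattern.startswith("."):
                matches = glob.glob(os.path.join(workspace, f"*{pattern}"))
            else:
                matches = [os.path.join(workspace, pattern)]
            for path in matches:
                with open(path) as f:
                    contents.append(f.read())
        return contents

With "files": [".txt"], any text file the agent writes is collected for the `should_contain` checks, so the instructions no longer need to pin down an exact filename.
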
diff --git a/agbenchmark/challenges/memory/m2/data.json b/agbenchmark/challenges/memory/m2/data.json index 29d7339b8..6e898298a 100644 --- a/agbenchmark/challenges/memory/m2/data.json +++ b/agbenchmark/challenges/memory/m2/data.json @@ -7,7 +7,7 @@ "answer": "3145\n3791\n9317\n9471", "should_contain": ["3145", "3791", "9317", "9471"], "should_not_contain": [], - "files": ["file_to_check.txt"], + "files": [".txt"], "type": "file" }, "mock": { diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt index 1c3f156e3..db609b4b9 100644 --- a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt +++ b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt @@ -1,5 +1,5 @@ sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232 -Write the ids previously mentioned to a file called file_to_check.txt. +Write the ids previously mentioned to a to a .txt file. gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0 diff --git a/agbenchmark/challenges/memory/m3/data.json b/agbenchmark/challenges/memory/m3/data.json index 6b53c4571..2a4f06ff7 100644 --- a/agbenchmark/challenges/memory/m3/data.json +++ b/agbenchmark/challenges/memory/m3/data.json @@ -7,7 +7,7 @@ "answer": "3145\n3791\n9317\n9471", "should_contain": ["3145", "3791", "9317", "9471"], "should_not_contain": [], - "files": ["file_to_check.txt"], + "files": [".txt"], "type": "file" }, "mock": { diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt index 729f0aa0d..ee490e6c9 100644 --- a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt +++ b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt @@ -1,5 +1,5 @@ sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232 -Write the phrases previously mentioned to a file called file_to_check.txt. +Write the phrases previously mentioned to a to a .txt file. 
gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0 diff --git a/agbenchmark/challenges/memory/m4/data.json b/agbenchmark/challenges/memory/m4/data.json index 316ef9476..adfd8e33f 100644 --- a/agbenchmark/challenges/memory/m4/data.json +++ b/agbenchmark/challenges/memory/m4/data.json @@ -12,7 +12,7 @@ "The giant hamster rode a unicycle through the crowded mall" ], "should_not_contain": [], - "files": ["file_to_check.txt"], + "files": [".txt"], "type": "file" }, "mock": { -- cgit v1.2.3 From f0f7d2be900ef9349b5dcd674c1ae862649c7f0a Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Fri, 7 Jul 2023 15:38:50 -0700 Subject: Fix memory challenge 2 (#71) --- agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt | 2 +- agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt index db609b4b9..99c9efa35 100644 --- a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt +++ b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt @@ -1,5 +1,5 @@ sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232 -Write the ids previously mentioned to a to a .txt file. +Write the ids previously mentioned to a .txt file. 
gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0 diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt index ee490e6c9..b4ef60bad 100644 --- a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt +++ b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt @@ -1,5 +1,5 @@ sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232 -Write the phrases previously mentioned to a to a .txt file. +Write the phrases previously mentioned to a .txt file. gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0 -- cgit v1.2.3 From 487f99f8f2d0e1294bd261da5650dfb98d6e884c Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Fri, 7 Jul 2023 15:49:37 -0700 Subject: Use artifacts out insted of python code (#72) --- .../memory/m1/artifacts_out/random_file.txt | 1 + .../memory/m2/artifacts_out/random_file.txt | 4 ++ .../memory/m3/artifacts_out/random_file.txt | 4 ++ .../memory/m4/artifacts_out/random_file.txt | 4 ++ .../retrieval/r1/artifacts_out/random_file.txt | 1 + .../retrieval/r2/artifacts_out/random_file.txt | 1 + .../retrieval/r3/artifacts_out/random_file.txt | 15 +++++ agbenchmark/mocks/mock_manager.py | 3 +- agbenchmark/mocks/tests/basic_mocks.py | 70 +--------------------- agbenchmark/mocks/tests/retrieval_mocks.py | 5 -- .../write_file/artifacts_out/random_file.txt | 1 + regression_tests.json | 24 ++++---- 12 files changed, 46 insertions(+), 87 deletions(-) create mode 100644 agbenchmark/challenges/memory/m1/artifacts_out/random_file.txt create mode 100644 agbenchmark/challenges/memory/m2/artifacts_out/random_file.txt create mode 100644 agbenchmark/challenges/memory/m3/artifacts_out/random_file.txt create mode 100644 agbenchmark/challenges/memory/m4/artifacts_out/random_file.txt create mode 100644 agbenchmark/challenges/retrieval/r1/artifacts_out/random_file.txt create mode 100644 agbenchmark/challenges/retrieval/r2/artifacts_out/random_file.txt create mode 100644 agbenchmark/challenges/retrieval/r3/artifacts_out/random_file.txt delete mode 100644 agbenchmark/mocks/tests/retrieval_mocks.py create mode 100644 
agbenchmark/tests/basic_abilities/write_file/artifacts_out/random_file.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts_out/random_file.txt b/agbenchmark/challenges/memory/m1/artifacts_out/random_file.txt new file mode 100644 index 000000000..86be9d159 --- /dev/null +++ b/agbenchmark/challenges/memory/m1/artifacts_out/random_file.txt @@ -0,0 +1 @@ +2314 diff --git a/agbenchmark/challenges/memory/m2/artifacts_out/random_file.txt b/agbenchmark/challenges/memory/m2/artifacts_out/random_file.txt new file mode 100644 index 000000000..7d48aaf18 --- /dev/null +++ b/agbenchmark/challenges/memory/m2/artifacts_out/random_file.txt @@ -0,0 +1,4 @@ +3145 +3791 +9317 +9471 diff --git a/agbenchmark/challenges/memory/m3/artifacts_out/random_file.txt b/agbenchmark/challenges/memory/m3/artifacts_out/random_file.txt new file mode 100644 index 000000000..7d48aaf18 --- /dev/null +++ b/agbenchmark/challenges/memory/m3/artifacts_out/random_file.txt @@ -0,0 +1,4 @@ +3145 +3791 +9317 +9471 diff --git a/agbenchmark/challenges/memory/m4/artifacts_out/random_file.txt b/agbenchmark/challenges/memory/m4/artifacts_out/random_file.txt new file mode 100644 index 000000000..9b8405bf1 --- /dev/null +++ b/agbenchmark/challenges/memory/m4/artifacts_out/random_file.txt @@ -0,0 +1,4 @@ +The purple elephant danced on a rainbow while eating a taco +The sneaky toaster stole my socks and ran away to Hawaii +My pet rock sings better than Beyoncé on Tuesdays +The giant hamster rode a unicycle through the crowded mall diff --git a/agbenchmark/challenges/retrieval/r1/artifacts_out/random_file.txt b/agbenchmark/challenges/retrieval/r1/artifacts_out/random_file.txt new file mode 100644 index 000000000..f558a0f94 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r1/artifacts_out/random_file.txt @@ -0,0 +1 @@ +25.89 diff --git a/agbenchmark/challenges/retrieval/r2/artifacts_out/random_file.txt b/agbenchmark/challenges/retrieval/r2/artifacts_out/random_file.txt new file mode 100644 index 000000000..8a0eae046 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r2/artifacts_out/random_file.txt @@ -0,0 +1 @@ +81,462 Millions diff --git a/agbenchmark/challenges/retrieval/r3/artifacts_out/random_file.txt b/agbenchmark/challenges/retrieval/r3/artifacts_out/random_file.txt new file mode 100644 index 000000000..d8d5bd162 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r3/artifacts_out/random_file.txt @@ -0,0 +1,15 @@ +15 Millions +112 Millions +117 Millions +204 Millions +413 Millions +2,014 Millions +3,198 Millions +4,046 Millions +7,000 Millions +11,759 Millions +21,461 Millions +24,578 Millions +31,536 Millions +53,823 Millions +81,462 Millions diff --git a/agbenchmark/mocks/mock_manager.py b/agbenchmark/mocks/mock_manager.py index 57c03405d..3a227e49b 100644 --- a/agbenchmark/mocks/mock_manager.py +++ b/agbenchmark/mocks/mock_manager.py @@ -1,14 +1,13 @@ from typing import Any, Dict, Optional import agbenchmark.mocks.tests.basic_mocks as basic_mocks -import agbenchmark.mocks.tests.retrieval_mocks as retrieval_mocks class MockManager: def __init__(self, task: Optional[str], config: Dict[str, Any]) -> None: self.task = task self.workspace = config["workspace"] - self.modules = [basic_mocks, retrieval_mocks] + self.modules = [basic_mocks] def delegate(self, mock_function_name: Any, *args: Any, **kwargs: Any) -> None: if hasattr(self, mock_function_name): diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py index 32149eb83..e4a1dedc0 100644 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ 
b/agbenchmark/mocks/tests/basic_mocks.py @@ -1,78 +1,12 @@ from agbenchmark.challenge import Challenge -def basic_write_file_mock(task: str, workspace: str) -> None: +def example_mock(task: str, workspace: str) -> None: """ This mock writes to a file (creates one if it doesn't exist) """ Challenge.write_to_file( workspace, "file_to_check.txt", - "Washington DC is the capital of the United States of America", - ) - - -def basic_retrieval_mock(task: str, workspace: str) -> None: - """ - This mock writes to a file (creates one if it doesn't exist) - """ - Challenge.write_to_file( - workspace, - "file_to_check.txt", - "25.89", - ) - - -def basic_retrieval_2_mock(task: str, workspace: str) -> None: - """ - This mock writes to a file (creates one if it doesn't exist) - """ - Challenge.write_to_file( - workspace, - "file_to_check.txt", - "81,462", - ) - - -def basic_retrieval_3_mock(task: str, workspace: str) -> None: - """ - This mock writes to a file (creates one if it doesn't exist) - """ - Challenge.write_to_file( - workspace, - "file_to_check.txt", - "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", - ) - - -def basic_memory_mock(task: str, workspace: str) -> None: - """ - This mock writes to a file (creates one if it doesn't exist) - """ - Challenge.write_to_file( - workspace, - "file_to_check.txt", - "2314", - ) - - -def remember_multiple_ids_mock(task: str, workspace: str) -> None: - """ - This mock writes to a file (creates one if it doesn't exist) - """ - Challenge.write_to_file( - workspace, - "file_to_check.txt", - "3145\n3791\n9317\n9471", - ) - - -def remember_multiple_phrases_with_noise_mock(task: str, workspace: str) -> None: - """ - This mock writes to a file (creates one if it doesn't exist) - """ - Challenge.write_to_file( - workspace, - "file_to_check.txt", - "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyoncé on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "This is an example showing how you can use mocks but here you can use artifacts_out folder instead of a mock.", ) diff --git a/agbenchmark/mocks/tests/retrieval_mocks.py b/agbenchmark/mocks/tests/retrieval_mocks.py deleted file mode 100644 index 9a8a57db4..000000000 --- a/agbenchmark/mocks/tests/retrieval_mocks.py +++ /dev/null @@ -1,5 +0,0 @@ -# TODO: Make it so that you can specify for tests to only run if their prerequisites are met. -# Prerequisites here would be writing to a file (basic_abilities test). 
-# Should also check if prerequisites exists in regression file -def retrieval_1_mock(task: str, workspace: str) -> None: - pass diff --git a/agbenchmark/tests/basic_abilities/write_file/artifacts_out/random_file.txt b/agbenchmark/tests/basic_abilities/write_file/artifacts_out/random_file.txt new file mode 100644 index 000000000..1f275fb98 --- /dev/null +++ b/agbenchmark/tests/basic_abilities/write_file/artifacts_out/random_file.txt @@ -0,0 +1 @@ +Washington diff --git a/regression_tests.json b/regression_tests.json index 59a9694bf..9714d42a8 100644 --- a/regression_tests.json +++ b/regression_tests.json @@ -4,6 +4,11 @@ "dependencies": [], "test": "agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py" }, + "TestDebugSimpleTypoWithoutGuidance": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/code/d2/d2_test.py" + }, "TestBasicMemory": { "difficulty": "basic", "dependencies": [], @@ -19,11 +24,6 @@ "dependencies": [], "test": "agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py" }, - "TestRememberMultiplePhrasesWithNoise": { - "difficulty": "medium", - "dependencies": [], - "test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py" - }, "TestRetrieval": { "difficulty": "basic", "dependencies": [], @@ -39,11 +39,6 @@ "dependencies": [], "test": "agbenchmark/challenges/retrieval/r2/r2_test.py" }, - "TestRetrieval3": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/retrieval/r3/r3_test.py" - }, "TestReadFile": { "difficulty": "basic", "dependencies": [ @@ -51,9 +46,14 @@ ], "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py" }, - "TestDebugSimpleTypoWithoutGuidance": { + "TestRetrieval3": { "difficulty": "basic", "dependencies": [], - "test": "agbenchmark/challenges/code/d2/d2_test.py" + "test": "agbenchmark/challenges/retrieval/r3/r3_test.py" + }, + "TestRememberMultiplePhrasesWithNoise": { + "difficulty": "medium", + "dependencies": [], + "test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py" } } \ No newline at end of file -- cgit v1.2.3 From e56b112aabbd862c97db48dd5d60d09efbedd5b7 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 8 Jul 2023 03:27:31 -0400 Subject: i/o workspace, adding superagi (#60) --- .github/workflows/superagi.yml | 62 ++++++++++++++++++++++++++++++++++++++++++ .gitmodules | 4 +++ agbenchmark/agent_interface.py | 2 +- agbenchmark/challenge.py | 10 +++---- agbenchmark/conftest.py | 57 +++++++++++++++++++++++++------------- agent/SuperAGI | 1 + config.json | 6 ++-- 7 files changed, 114 insertions(+), 28 deletions(-) create mode 100644 .github/workflows/superagi.yml create mode 160000 agent/SuperAGI diff --git a/.github/workflows/superagi.yml b/.github/workflows/superagi.yml new file mode 100644 index 000000000..128c28dd7 --- /dev/null +++ b/.github/workflows/superagi.yml @@ -0,0 +1,62 @@ +name: SuperAgi Regression Test + +on: + workflow_dispatch: + branches: [master] + push: + branches: [stable, master, ci-test*] + +jobs: + regression-tests: + permissions: + pull-requests: write + contents: write + runs-on: ubuntu-latest + timeout-minutes: 30 + strategy: + matrix: + python-version: ['3.10'] + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 0 + ref: ${{ github.event.pull_request.head.ref }} + repository: ${{ github.event.pull_request.head.repo.full_name }} + submodules: true + + - name: Set up Python ${{ matrix.python-version }} + uses: 
actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - id: get_date + name: Get date + run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT + + - name: Install Poetry + run: | + curl -sSL https://install.python-poetry.org | python - + + - name: Set up Poetry cache + uses: actions/cache@v2 + with: + path: | + ~/.cache/pypoetry + .venv + key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} + + - name: Set up venv and install Python dependencies + run: | + poetry install --only main + poetry build + + - name: Run regression tests + run: | + cd agent/SuperAgi + cp config_template.yaml config.yaml + sed -i 's/OPENAI_API_KEY:.*/OPENAI_API_KEY: "'"${{ secrets.OPENAI_API_KEY }}"'"/' config.yaml + docker-compose up --build + pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl + agbenchmark start --reg diff --git a/.gitmodules b/.gitmodules index 5af445f7a..f14b5e07d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -14,3 +14,7 @@ path = agent/smol-developer url = https://github.com/merwanehamadi/developer.git branch = benchmark-integration +[submodule "agent/SuperAGI"] + path = agent/SuperAGI + url = https://github.com/SilenNaihin/SuperAGI.git + branch = benchmark-integration diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index 77eb110bf..4244fa082 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -15,7 +15,7 @@ MOCK_FLAG = os.getenv("MOCK_TEST") def run_agent( - task: Optional[str], + task: str, mock_func: Optional[str], config: Dict[str, Any], challenge_location: str, diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py index 29bc3ff91..d7e1c8965 100644 --- a/agbenchmark/challenge.py +++ b/agbenchmark/challenge.py @@ -48,8 +48,8 @@ class Challenge(ABC, metaclass=ChallengeMeta): return self.data.mock.mock_func if self.data.mock else None @property - def task(self) -> Optional[str]: - return ( + def task(self) -> str: + return str( self.data.mock.mock_task if self.data.mock and MOCK_TEST else self.data.task ) @@ -80,13 +80,13 @@ class Challenge(ABC, metaclass=ChallengeMeta): @staticmethod def open_file(workspace: str, filename: str) -> str: - script_dir = os.path.abspath(workspace) + script_dir = workspace workspace_dir = os.path.join(script_dir, filename) with open(workspace_dir, "r") as f: return f.read() def get_artifacts_out(self, workspace: str, file_patterns: list) -> List[str]: - script_dir = os.path.abspath(workspace) + script_dir = workspace files_contents = [] for file_pattern in file_patterns: @@ -115,7 +115,7 @@ class Challenge(ABC, metaclass=ChallengeMeta): @staticmethod def write_to_file(workspace: str, filename: str, content: str) -> None: - script_dir = os.path.abspath(workspace) + script_dir = workspace print("Writing file at", script_dir) workspace_dir = os.path.join(script_dir, filename) diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 7203ee6bb..40457fb67 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -10,19 +10,24 @@ from agbenchmark.start_benchmark import CONFIG_PATH, REGRESSION_TESTS_PATH from agbenchmark.tests.regression.RegressionManager import RegressionManager -def get_dynamic_workspace(config: Dict[str, Any]) -> str: - # Extract the string inside ${...} - path_expr = config["workspace"][2:-1] +def resolve_workspace(config: Dict[str, Any]) -> str: + if config.get("workspace", "").startswith("${") and config.get( + "workspace", "" + 
).endswith("}"): + # Extract the string inside ${...} + path_expr = config["workspace"][2:-1] - # Check if it starts with "os.path.join" - if path_expr.strip().startswith("os.path.join"): - # Evaluate the path string - path_value = eval(path_expr) + # Check if it starts with "os.path.join" + if path_expr.strip().startswith("os.path.join"): + # Evaluate the path string + path_value = eval(path_expr) - # Replace the original string with the evaluated result - return path_value + # Replace the original string with the evaluated result + return path_value + else: + raise ValueError("Invalid workspace path expression.") else: - raise ValueError("Invalid workspace path expression.") + return os.path.abspath(Path(os.getcwd()) / config["workspace"]) @pytest.fixture(scope="module") @@ -31,22 +36,36 @@ def config(request: Any) -> None: with open(CONFIG_PATH, "r") as f: config = json.load(f) - if config.get("workspace", "").startswith("${") and config.get( - "workspace", "" - ).endswith("}"): - path = get_dynamic_workspace(config) - config["workspace"] = path - else: - config["workspace"] = Path(os.getcwd()) / config["workspace"] + if request.config.getoption("--mock"): + config["workspace"] = "agbenchmark/mocks/workspace" + elif isinstance(config["workspace"], str): + config["workspace"] = resolve_workspace(config) + else: # it's a input output dict + config["workspace"]["input"] = resolve_workspace(config) + config["workspace"]["output"] = resolve_workspace(config) + return config @pytest.fixture(scope="module", autouse=True) def workspace(config: Dict[str, Any]) -> Generator[str, None, None]: + output_path = config["workspace"] + + # checks if its an input output paradigm + if not isinstance(config["workspace"], str): + output_path = config["workspace"]["output"] + if not os.path.exists(config["workspace"]["input"]): + os.makedirs(config["workspace"]["input"], exist_ok=True) + + # create output directory if it doesn't exist + if not os.path.exists(output_path): + os.makedirs(output_path, exist_ok=True) + yield config["workspace"] # teardown after test function completes - for filename in os.listdir(config["workspace"]): - file_path = os.path.join(config["workspace"], filename) + + for filename in os.listdir(output_path): + file_path = os.path.join(output_path, filename) try: if os.path.isfile(file_path) or os.path.islink(file_path): os.unlink(file_path) diff --git a/agent/SuperAGI b/agent/SuperAGI new file mode 160000 index 000000000..12e248e90 --- /dev/null +++ b/agent/SuperAGI @@ -0,0 +1 @@ +Subproject commit 12e248e90112e50ee011f0dcb1b3fa02030661a4 diff --git a/config.json b/config.json index 378e69025..88526a134 100644 --- a/config.json +++ b/config.json @@ -1,6 +1,6 @@ { - "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "benchmarks.py", - "home_path": "agent/mini-agi/", + "workspace": "projects/my-new-project/workspace", + "entry_path": "agent/gpt-engineer/benchmarks.py", + "home_path": "agent/gpt-engineer", "cutoff": 60 } -- cgit v1.2.3 From 082a87661224d25ed969557113e08f84febfbc12 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 8 Jul 2023 05:04:06 -0400 Subject: fixing the incorrect addition of superagi (#73) --- .github/workflows/superagi.yml | 6 +++--- agent/SuperAGI | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/superagi.yml b/.github/workflows/superagi.yml index 128c28dd7..5ab52d33f 100644 --- a/.github/workflows/superagi.yml +++ b/.github/workflows/superagi.yml @@ -54,9 +54,9 @@ jobs: - name: Run regression tests 
run: | - cd agent/SuperAgi + cd agent/SuperAGI cp config_template.yaml config.yaml sed -i 's/OPENAI_API_KEY:.*/OPENAI_API_KEY: "'"${{ secrets.OPENAI_API_KEY }}"'"/' config.yaml - docker-compose up --build + docker-compose up -d --build pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl - agbenchmark start --reg + agbenchmark start --maintain diff --git a/agent/SuperAGI b/agent/SuperAGI index 12e248e90..7ab2994d4 160000 --- a/agent/SuperAGI +++ b/agent/SuperAGI @@ -1 +1 @@ -Subproject commit 12e248e90112e50ee011f0dcb1b3fa02030661a4 +Subproject commit 7ab2994d4b44fa008f9ac27b196f134d27878916 -- cgit v1.2.3 From a35569a77b7b9f9048d340646caa1c853b39a501 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 8 Jul 2023 12:47:48 -0400 Subject: submodule integration --- .github/workflows/agentgpt.yml | 66 ++++++++++++++++++++++++++++++++++++++++++ .gitmodules | 4 +++ agent/AgentGPT | 1 + 3 files changed, 71 insertions(+) create mode 100644 .github/workflows/agentgpt.yml create mode 160000 agent/AgentGPT diff --git a/.github/workflows/agentgpt.yml b/.github/workflows/agentgpt.yml new file mode 100644 index 000000000..8c9b42203 --- /dev/null +++ b/.github/workflows/agentgpt.yml @@ -0,0 +1,66 @@ +name: AgentGPT Regression Test + +on: + workflow_dispatch: + branches: [master] + push: + branches: [stable, master, ci-test*] + +jobs: + regression-tests: + permissions: + pull-requests: write + contents: write + runs-on: ubuntu-latest + timeout-minutes: 30 + strategy: + matrix: + python-version: ['3.10'] + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 0 + ref: ${{ github.event.pull_request.head.ref }} + repository: ${{ github.event.pull_request.head.repo.full_name }} + submodules: true + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - id: get_date + name: Get date + run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT + + - name: Install Poetry + run: | + curl -sSL https://install.python-poetry.org | python - + + - name: Set up Poetry cache + uses: actions/cache@v2 + with: + path: | + ~/.cache/pypoetry + .venv + key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} + + - name: Set up venv and install Python dependencies + run: | + poetry install --only main + poetry build + + - name: Run regression tests + run: | + cd agent/AgentGPT + cd next + npm install + ../ + cp .env_example .env + docker-compose up -d --build + pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl + agbenchmark start --maintain + env: + REWORKD_PLATFORM_OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} diff --git a/.gitmodules b/.gitmodules index f14b5e07d..ee535b8b2 100644 --- a/.gitmodules +++ b/.gitmodules @@ -18,3 +18,7 @@ path = agent/SuperAGI url = https://github.com/SilenNaihin/SuperAGI.git branch = benchmark-integration +[submodule "agent/AgentGPT"] + path = agent/AgentGPT + url = https://github.com/SilenNaihin/AgentGPT.git + branch = benchmark-integration diff --git a/agent/AgentGPT b/agent/AgentGPT new file mode 160000 index 000000000..8e09b20b2 --- /dev/null +++ b/agent/AgentGPT @@ -0,0 +1 @@ +Subproject commit 8e09b20b2a38f06a38ab6afd16a00ffe2ed514c7 -- cgit v1.2.3 From 2d05c3ec5600e173d288f6714b3c3fc5e0087ae2 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 8 Jul 2023 12:50:39 -0400 Subject: reverting accidental previous changes --- .github/workflows/agentgpt.yml | 66 
------------------------------------------ .gitmodules | 4 --- agent/AgentGPT | 2 +- agent/SuperAGI | 1 - 4 files changed, 1 insertion(+), 72 deletions(-) delete mode 100644 .github/workflows/agentgpt.yml delete mode 160000 agent/SuperAGI diff --git a/.github/workflows/agentgpt.yml b/.github/workflows/agentgpt.yml deleted file mode 100644 index 8c9b42203..000000000 --- a/.github/workflows/agentgpt.yml +++ /dev/null @@ -1,66 +0,0 @@ -name: AgentGPT Regression Test - -on: - workflow_dispatch: - branches: [master] - push: - branches: [stable, master, ci-test*] - -jobs: - regression-tests: - permissions: - pull-requests: write - contents: write - runs-on: ubuntu-latest - timeout-minutes: 30 - strategy: - matrix: - python-version: ['3.10'] - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 0 - ref: ${{ github.event.pull_request.head.ref }} - repository: ${{ github.event.pull_request.head.repo.full_name }} - submodules: true - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - id: get_date - name: Get date - run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT - - - name: Install Poetry - run: | - curl -sSL https://install.python-poetry.org | python - - - - name: Set up Poetry cache - uses: actions/cache@v2 - with: - path: | - ~/.cache/pypoetry - .venv - key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} - - - name: Set up venv and install Python dependencies - run: | - poetry install --only main - poetry build - - - name: Run regression tests - run: | - cd agent/AgentGPT - cd next - npm install - ../ - cp .env_example .env - docker-compose up -d --build - pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl - agbenchmark start --maintain - env: - REWORKD_PLATFORM_OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} diff --git a/.gitmodules b/.gitmodules index ee535b8b2..f14b5e07d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -18,7 +18,3 @@ path = agent/SuperAGI url = https://github.com/SilenNaihin/SuperAGI.git branch = benchmark-integration -[submodule "agent/AgentGPT"] - path = agent/AgentGPT - url = https://github.com/SilenNaihin/AgentGPT.git - branch = benchmark-integration diff --git a/agent/AgentGPT b/agent/AgentGPT index 8e09b20b2..b92ddf858 160000 --- a/agent/AgentGPT +++ b/agent/AgentGPT @@ -1 +1 @@ -Subproject commit 8e09b20b2a38f06a38ab6afd16a00ffe2ed514c7 +Subproject commit b92ddf858529eddb6f17d85875767094f7ea2bfe diff --git a/agent/SuperAGI b/agent/SuperAGI deleted file mode 160000 index 7ab2994d4..000000000 --- a/agent/SuperAGI +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 7ab2994d4b44fa008f9ac27b196f134d27878916 -- cgit v1.2.3 From db86ccdcb4217c5c8ab909f7628a00827ab52c42 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 8 Jul 2023 13:02:47 -0400 Subject: removing agentgpt --- agent/AgentGPT | 1 - agent/SuperAGI | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 160000 agent/AgentGPT create mode 160000 agent/SuperAGI diff --git a/agent/AgentGPT b/agent/AgentGPT deleted file mode 160000 index b92ddf858..000000000 --- a/agent/AgentGPT +++ /dev/null @@ -1 +0,0 @@ -Subproject commit b92ddf858529eddb6f17d85875767094f7ea2bfe diff --git a/agent/SuperAGI b/agent/SuperAGI new file mode 160000 index 000000000..7ab2994d4 --- /dev/null +++ b/agent/SuperAGI @@ -0,0 +1 @@ +Subproject commit 7ab2994d4b44fa008f9ac27b196f134d27878916 -- cgit v1.2.3 From 
69bd41f7414c1028e61affd3a340054355d9249a Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 8 Jul 2023 21:43:38 -0400 Subject: Quality of life improvements & fixes (#75) --- .gitignore | 2 +- agbenchmark/README.md | 5 +- agbenchmark/RegressionManager.py | 29 ++++++++++ agbenchmark/agent_interface.py | 21 ++----- agbenchmark/challenge.py | 39 ++++++------- agbenchmark/challenges/README.md | 7 +-- agbenchmark/challenges/code/code.py | 8 --- agbenchmark/challenges/code/d1/data.json | 7 +-- .../d1/debug_simple_typo_with_guidance_test.py | 18 +----- agbenchmark/challenges/code/d2/d2_test.py | 20 +------ agbenchmark/challenges/code/d2/data.json | 9 +-- agbenchmark/challenges/define_task_types.py | 7 --- agbenchmark/challenges/interface/browse_test.py | 0 .../read_file/artifacts_in/file_to_check.txt | 1 + .../read_file/artifacts_out/file_to_check.txt | 1 + .../challenges/interface/read_file/data.json | 17 ++++++ .../interface/read_file/read_file_test.py | 12 ++++ .../write_file/artifacts_out/random_file.txt | 1 + .../challenges/interface/write_file/data.json | 18 ++++++ .../interface/write_file/write_file_test.py | 13 +++++ agbenchmark/challenges/memory/m1/data.json | 3 +- agbenchmark/challenges/memory/m1/m1_test.py | 18 +----- agbenchmark/challenges/memory/m2/data.json | 7 +-- .../memory/m2/remember_multiple_ids_test.py | 20 +------ agbenchmark/challenges/memory/m3/data.json | 7 +-- .../m3/remember_multiple_ids_with_noise_test.py | 21 +------ agbenchmark/challenges/memory/m4/data.json | 7 +-- .../remember_multiple_phrases_with_noise_test.py | 21 +------ agbenchmark/challenges/memory/memory.py | 8 --- agbenchmark/challenges/retrieval/r1/data.json | 7 +-- agbenchmark/challenges/retrieval/r1/r1_test.py | 18 +----- agbenchmark/challenges/retrieval/r2/data.json | 7 +-- agbenchmark/challenges/retrieval/r2/r2_test.py | 18 +----- agbenchmark/challenges/retrieval/r3/data.json | 25 +++++--- agbenchmark/challenges/retrieval/r3/r3_test.py | 17 +----- agbenchmark/challenges/retrieval/retrieval.py | 8 --- agbenchmark/conftest.py | 66 +++++++++++++++++----- agbenchmark/mocks/mock_manager.py | 28 --------- agbenchmark/mocks/tests/basic_mocks.py | 12 ---- agbenchmark/start_benchmark.py | 48 ++++++++-------- .../tests/basic_abilities/basic_challenge.py | 8 --- agbenchmark/tests/basic_abilities/browse_test.py | 0 .../read_file/artifacts_in/file_to_check.txt | 1 - .../read_file/artifacts_out/file_to_check.txt | 1 - .../tests/basic_abilities/read_file/data.json | 20 ------- .../basic_abilities/read_file/read_file_test.py | 24 -------- .../tests/basic_abilities/remember_context_test.py | 0 .../write_file/artifacts_out/random_file.txt | 1 - .../tests/basic_abilities/write_file/data.json | 22 -------- .../basic_abilities/write_file/write_file_test.py | 25 -------- agbenchmark/tests/regression/RegressionManager.py | 29 ---------- agent/Auto-GPT | 2 +- agent/gpt-engineer | 2 +- agent/smol-developer | 2 +- config.json | 6 +- pyproject.toml | 2 +- regression_tests.json | 54 +++++++++--------- 57 files changed, 279 insertions(+), 521 deletions(-) create mode 100644 agbenchmark/RegressionManager.py delete mode 100644 agbenchmark/challenges/code/code.py create mode 100644 agbenchmark/challenges/interface/browse_test.py create mode 100644 agbenchmark/challenges/interface/read_file/artifacts_in/file_to_check.txt create mode 100644 agbenchmark/challenges/interface/read_file/artifacts_out/file_to_check.txt create mode 100644 agbenchmark/challenges/interface/read_file/data.json create mode 100644 
agbenchmark/challenges/interface/read_file/read_file_test.py create mode 100644 agbenchmark/challenges/interface/write_file/artifacts_out/random_file.txt create mode 100644 agbenchmark/challenges/interface/write_file/data.json create mode 100644 agbenchmark/challenges/interface/write_file/write_file_test.py delete mode 100644 agbenchmark/challenges/memory/memory.py delete mode 100644 agbenchmark/challenges/retrieval/retrieval.py delete mode 100644 agbenchmark/mocks/mock_manager.py delete mode 100644 agbenchmark/mocks/tests/basic_mocks.py delete mode 100644 agbenchmark/tests/basic_abilities/basic_challenge.py delete mode 100644 agbenchmark/tests/basic_abilities/browse_test.py delete mode 100644 agbenchmark/tests/basic_abilities/read_file/artifacts_in/file_to_check.txt delete mode 100644 agbenchmark/tests/basic_abilities/read_file/artifacts_out/file_to_check.txt delete mode 100644 agbenchmark/tests/basic_abilities/read_file/data.json delete mode 100644 agbenchmark/tests/basic_abilities/read_file/read_file_test.py delete mode 100644 agbenchmark/tests/basic_abilities/remember_context_test.py delete mode 100644 agbenchmark/tests/basic_abilities/write_file/artifacts_out/random_file.txt delete mode 100644 agbenchmark/tests/basic_abilities/write_file/data.json delete mode 100644 agbenchmark/tests/basic_abilities/write_file/write_file_test.py delete mode 100644 agbenchmark/tests/regression/RegressionManager.py diff --git a/.gitignore b/.gitignore index 3581dc933..7d0419ca4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -agbenchmark/mocks/workspace/ +agbenchmark/workspace/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/agbenchmark/README.md b/agbenchmark/README.md index 01f602dc6..42e2bd4dd 100644 --- a/agbenchmark/README.md +++ b/agbenchmark/README.md @@ -53,8 +53,7 @@ import os class TestWriteFile(BasicChallenge): """Testing if LLM can write to a file""" - @pytest.mark.depends(on=[], name="basic_write_file") - def test_method(self, workspace): + def test_method(self, config): # implement scoring logic by looking at workspace ``` @@ -82,7 +81,7 @@ Add the below to create a file in the workspace prior to running a challenge. On ## Workspace -If `--mock` flag is used it is at `agbenchmark/mocks/workspace`. Otherwise for mini-agi it is at `C:/Users//miniagi` - it will be automitcally set on config +If `--mock` flag is used it is at `agbenchmark/workspace`. 
Otherwise for mini-agi it is at `C:/Users//miniagi` - it will be automitcally set on config #### Dataset diff --git a/agbenchmark/RegressionManager.py b/agbenchmark/RegressionManager.py new file mode 100644 index 000000000..a1379ecae --- /dev/null +++ b/agbenchmark/RegressionManager.py @@ -0,0 +1,29 @@ +import json + + +class RegressionManager: + """Abstracts interaction with the regression tests file""" + + def __init__(self, filename: str): + self.filename = filename + self.load() + + def load(self) -> None: + try: + with open(self.filename, "r") as f: + self.tests = json.load(f) + except (FileNotFoundError, json.decoder.JSONDecodeError): + self.tests = {} + + def save(self) -> None: + with open(self.filename, "w") as f: + json.dump(self.tests, f, indent=4) + + def add_test(self, test_name: str, test_details: dict) -> None: + self.tests[test_name] = test_details + self.save() + + def remove_test(self, test_name: str) -> None: + if test_name in self.tests: + del self.tests[test_name] + self.save() diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index 4244fa082..1d43577c7 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -3,37 +3,27 @@ import shutil import subprocess import sys import time -from typing import Any, Dict, Optional +from typing import Any, Dict from dotenv import load_dotenv -from agbenchmark.mocks.mock_manager import MockManager - load_dotenv() -MOCK_FLAG = os.getenv("MOCK_TEST") +mock_test_str = os.getenv("MOCK_TEST") +MOCK_FLAG = mock_test_str.lower() == "true" if mock_test_str else False def run_agent( task: str, - mock_func: Optional[str], config: Dict[str, Any], challenge_location: str, ) -> None: """Calling to get a response""" - if MOCK_FLAG == "True": + if MOCK_FLAG: copy_artifacts_into_workspace( config["workspace"], "artifacts_out", challenge_location ) - if mock_func is None: - print("No mock provided") - return - mock_manager = MockManager( - task, config - ) # workspace doesn't need to be passed in, stays the same - print("Server unavailable, using mock", mock_func) - mock_manager.delegate(mock_func) else: timeout = config["cutoff"] print( @@ -99,6 +89,3 @@ def copy_artifacts_into_workspace( full_file_name = os.path.join(source_dir, file_name) if os.path.isfile(full_file_name): shutil.copy(full_file_name, workspace) - - -ENVIRONMENT = os.getenv("ENVIRONMENT") or "production" diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py index d7e1c8965..ddf69f42d 100644 --- a/agbenchmark/challenge.py +++ b/agbenchmark/challenge.py @@ -4,9 +4,8 @@ import os import subprocess import types from abc import ABC, ABCMeta -from typing import Any, Dict, List, Optional, Tuple, Type, cast +from typing import Any, Dict, List, Tuple, Type, cast -import pytest from dotenv import load_dotenv from agbenchmark.challenges.define_task_types import ChallengeData, Ground @@ -19,7 +18,6 @@ MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str else False class ChallengeMeta(ABCMeta): def __init__(self, name: str, bases: Tuple[Type, ...], dct: Dict[str, Any]) -> None: - super().__init__(name, bases, dct) try: frame = cast(types.FrameType, inspect.currentframe()) @@ -40,18 +38,13 @@ class Challenge(ABC, metaclass=ChallengeMeta): @property def data(self) -> ChallengeData: file_path = f"{self.CHALLENGE_LOCATION}/data.json" - Challenge._data_cache[file_path] = ChallengeData.deserialize(file_path) + if file_path not in Challenge._data_cache: + Challenge._data_cache[file_path] = ChallengeData.deserialize(file_path) 
return Challenge._data_cache[file_path] - @property - def mock(self) -> Optional[str]: - return self.data.mock.mock_func if self.data.mock else None - @property def task(self) -> str: - return str( - self.data.mock.mock_task if self.data.mock and MOCK_TEST else self.data.task - ) + return self.data.task @property def dependencies(self) -> list: @@ -64,17 +57,8 @@ class Challenge(ABC, metaclass=ChallengeMeta): config["workspace"], "artifacts_in", self.__class__.CHALLENGE_LOCATION ) - run_agent(self.task, self.mock, config, self.__class__.CHALLENGE_LOCATION) + run_agent(self.task, config, self.__class__.CHALLENGE_LOCATION) - @property - def name(self) -> str: - return self.data.name - - @pytest.mark.parametrize( - "challenge_data", - [data], - indirect=True, - ) def test_method(self, config: Dict[str, Any]) -> None: raise NotImplementedError @@ -151,3 +135,16 @@ class Challenge(ABC, metaclass=ChallengeMeta): ) return 1.0 + + def get_scores(self, config: Dict[str, Any]) -> List[float]: + files_contents = self.get_artifacts_out( + config["workspace"], self.data.ground.files + ) + + scores = [] + for file_content in files_contents: + score = self.scoring(file_content, self.data.ground) + print("Your score is:", score) + scores.append(score) + + return scores diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md index 2d782d1fc..305cd28f1 100644 --- a/agbenchmark/challenges/README.md +++ b/agbenchmark/challenges/README.md @@ -25,10 +25,9 @@ Example: ```python { - "name": "basic_write_file", "category": ["basic"], "task": "Print the the capital of America to a .txt file", - "dependencies": [], + "dependencies": ["TestWriteFile"], # the class name of the test "ground": { "answer": "Washington", "should_contain": ["Washington"], @@ -36,10 +35,6 @@ Example: "files": [".txt"], "type": "file" }, - "mock": { - "mock_func": "basic_write_file_mock", - "mock_task": "What is the capital of America?" 
- }, "info": { "difficulty": "basic", "description": "Tests the writing to file", diff --git a/agbenchmark/challenges/code/code.py b/agbenchmark/challenges/code/code.py deleted file mode 100644 index 508d24a90..000000000 --- a/agbenchmark/challenges/code/code.py +++ /dev/null @@ -1,8 +0,0 @@ -import pytest - -from agbenchmark.challenge import Challenge - - -@pytest.mark.code -class CodeChallenge(Challenge): - """Challenge for memory""" diff --git a/agbenchmark/challenges/code/d1/data.json b/agbenchmark/challenges/code/d1/data.json index c29c3d83a..6ac284b81 100644 --- a/agbenchmark/challenges/code/d1/data.json +++ b/agbenchmark/challenges/code/d1/data.json @@ -1,8 +1,7 @@ { - "name": "debug_simple_typo_with_guidance", "category": ["code"], "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", - "dependencies": [], + "dependencies": ["TestReadFile", "TestWriteFile"], "ground": { "answer": "[0, 1] [2, 5] [0, 3]", "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], @@ -10,10 +9,6 @@ "files": ["test.py"], "type": "execute_python_code" }, - "mock": { - "mock_func": null, - "mock_task": null - }, "info": { "difficulty": "basic", "description": "Tests ability for the agent to debug python code with a simple typo in it.", diff --git a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py index 16a12ae41..d104b3374 100644 --- a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py +++ b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py @@ -1,25 +1,13 @@ from typing import Any, Dict -import pytest +from agbenchmark.challenge import Challenge -from agbenchmark.challenges.code.code import CodeChallenge - -class TestDebugSimpleTypoWithGuidance(CodeChallenge): +class TestDebugSimpleTypoWithGuidance(Challenge): """The first memory challenge""" - @pytest.mark.depends(name="test_debug_simple_typo_with_guidance") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.get_artifacts_out( - config["workspace"], self.data.ground.files - ) - - scores = [] - for file_content in files_contents: - score = self.scoring(file_content, self.data.ground) - print("Your score is:", score) - scores.append(score) - + scores = self.get_scores(config) assert 1 in scores diff --git a/agbenchmark/challenges/code/d2/d2_test.py b/agbenchmark/challenges/code/d2/d2_test.py index 7a5988b94..b02114a75 100644 --- a/agbenchmark/challenges/code/d2/d2_test.py +++ b/agbenchmark/challenges/code/d2/d2_test.py @@ -1,28 +1,14 @@ from typing import Any, Dict -import pytest +from agbenchmark.challenge import Challenge -from agbenchmark.challenges.code.code import CodeChallenge - -class TestDebugSimpleTypoWithoutGuidance(CodeChallenge): +class TestDebugSimpleTypoWithoutGuidance(Challenge): """The first memory challenge""" - @pytest.mark.depends( - name="test_debug_simple_typo_without_guidance", - depends=["test_debug_simple_typo_with_guidance"], - ) def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.get_artifacts_out( - config["workspace"], self.data.ground.files - ) - - scores = [] - for file_content in files_contents: - score = self.scoring(file_content, self.data.ground) - print("Your score is:", score) - scores.append(score) + scores = self.get_scores(config) assert 1 in scores diff --git a/agbenchmark/challenges/code/d2/data.json 
b/agbenchmark/challenges/code/d2/data.json index 6003055a8..3de5111f5 100644 --- a/agbenchmark/challenges/code/d2/data.json +++ b/agbenchmark/challenges/code/d2/data.json @@ -1,8 +1,7 @@ { - "name": "debug_simple_typo_without_guidance", "category": ["code"], "task": "Make test.py run without errors.", - "dependencies": [], + "dependencies": ["TestDebugSimpleTypoWithGuidance"], "ground": { "answer": "[0, 1] [2, 5] [0, 3]", "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], @@ -10,12 +9,8 @@ "files": ["test.py"], "type": "execute_python_code" }, - "mock": { - "mock_func": null, - "mock_task": null - }, "info": { - "difficulty": "basic", + "difficulty": "medium", "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", "side_effects": ["tests if there is in fact an LLM attached"] } diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index f84df1262..308cb5ea6 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -4,11 +4,6 @@ from typing import List, Optional from pydantic import BaseModel -class Mock(BaseModel): - mock_func: Optional[str] = None - mock_task: Optional[str] = None - - class Info(BaseModel): difficulty: str description: str @@ -24,12 +19,10 @@ class Ground(BaseModel): class ChallengeData(BaseModel): - name: str category: List[str] task: str dependencies: List[str] ground: Ground - mock: Optional[Mock] = None info: Info def serialize(self, path: str) -> None: diff --git a/agbenchmark/challenges/interface/browse_test.py b/agbenchmark/challenges/interface/browse_test.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/challenges/interface/read_file/artifacts_in/file_to_check.txt b/agbenchmark/challenges/interface/read_file/artifacts_in/file_to_check.txt new file mode 100644 index 000000000..980a0d5f1 --- /dev/null +++ b/agbenchmark/challenges/interface/read_file/artifacts_in/file_to_check.txt @@ -0,0 +1 @@ +Hello World! diff --git a/agbenchmark/challenges/interface/read_file/artifacts_out/file_to_check.txt b/agbenchmark/challenges/interface/read_file/artifacts_out/file_to_check.txt new file mode 100644 index 000000000..c1a7879a1 --- /dev/null +++ b/agbenchmark/challenges/interface/read_file/artifacts_out/file_to_check.txt @@ -0,0 +1 @@ +random string Hello World! 
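[With the `Mock` model dropped from `define_task_types.py` above, a challenge definition reduces to category, task, dependencies, ground and info. A sketch of loading one by hand; `ChallengeData.deserialize` itself is not shown in this patch, so plain `json.load` plus pydantic validation is assumed here.]

```python
import json

from agbenchmark.challenges.define_task_types import ChallengeData

with open("agbenchmark/challenges/interface/write_file/data.json") as f:
    raw = json.load(f)

# Pydantic validates the remaining fields; the extra "name" key still
# present in some data.json files is ignored by default in pydantic v1.
challenge = ChallengeData(**raw)
print(challenge.dependencies)  # [] for TestWriteFile, class names elsewhere
```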
diff --git a/agbenchmark/challenges/interface/read_file/data.json b/agbenchmark/challenges/interface/read_file/data.json new file mode 100644 index 000000000..dd399fabf --- /dev/null +++ b/agbenchmark/challenges/interface/read_file/data.json @@ -0,0 +1,17 @@ +{ + "name": "ReadFile", + "category": ["interface"], + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "dependencies": ["TestWriteFile"], + "ground": { + "answer": "random string Hello World!", + "should_contain": ["random string", "Hello World!"], + "files": ["file_to_check.txt"], + "type": "file" + }, + "info": { + "description": "This reads the file quickly", + "difficulty": "basic", + "side_effects": [""] + } +} diff --git a/agbenchmark/challenges/interface/read_file/read_file_test.py b/agbenchmark/challenges/interface/read_file/read_file_test.py new file mode 100644 index 000000000..591d0a744 --- /dev/null +++ b/agbenchmark/challenges/interface/read_file/read_file_test.py @@ -0,0 +1,12 @@ +from typing import Any, Dict + +from agbenchmark.challenge import Challenge + + +class TestReadFile(Challenge): + """Testing if LLM can read a file""" + + def test_method(self, config: Dict[str, Any]) -> None: + self.setup_challenge(config) + scores = self.get_scores(config) + assert 1 in scores diff --git a/agbenchmark/challenges/interface/write_file/artifacts_out/random_file.txt b/agbenchmark/challenges/interface/write_file/artifacts_out/random_file.txt new file mode 100644 index 000000000..1f275fb98 --- /dev/null +++ b/agbenchmark/challenges/interface/write_file/artifacts_out/random_file.txt @@ -0,0 +1 @@ +Washington diff --git a/agbenchmark/challenges/interface/write_file/data.json b/agbenchmark/challenges/interface/write_file/data.json new file mode 100644 index 000000000..b3e4b6f02 --- /dev/null +++ b/agbenchmark/challenges/interface/write_file/data.json @@ -0,0 +1,18 @@ +{ + "name": "WriteFile", + "category": ["interface"], + "task": "Print the the capital of America to a .txt file", + "dependencies": [], + "ground": { + "answer": "Washington", + "should_contain": ["Washington"], + "should_not_contain": ["New York", "Los Angeles", "San Francisco"], + "files": [".txt"], + "type": "file" + }, + "info": { + "difficulty": "basic", + "description": "Tests the writing to file", + "side_effects": ["tests if there is in fact an LLM attached"] + } +} diff --git a/agbenchmark/challenges/interface/write_file/write_file_test.py b/agbenchmark/challenges/interface/write_file/write_file_test.py new file mode 100644 index 000000000..4a52b0979 --- /dev/null +++ b/agbenchmark/challenges/interface/write_file/write_file_test.py @@ -0,0 +1,13 @@ +from typing import Any, Dict + +from agbenchmark.challenge import Challenge + + +class TestWriteFile(Challenge): + """Testing if LLM can write to a file""" + + def test_method(self, config: Dict[str, Any]) -> None: + self.setup_challenge(config) + + scores = self.get_scores(config) + assert 1 in scores diff --git a/agbenchmark/challenges/memory/m1/data.json b/agbenchmark/challenges/memory/m1/data.json index c7d441903..f771a2669 100644 --- a/agbenchmark/challenges/memory/m1/data.json +++ b/agbenchmark/challenges/memory/m1/data.json @@ -1,8 +1,7 @@ { - "name": "basic_memory", "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", - "dependencies": [], + "dependencies": ["TestReadFile", "TestWriteFile"], "ground": { "answer": "2314", "should_contain": ["2314"], diff --git a/agbenchmark/challenges/memory/m1/m1_test.py 
b/agbenchmark/challenges/memory/m1/m1_test.py index 9e5e0a775..0fc537eeb 100644 --- a/agbenchmark/challenges/memory/m1/m1_test.py +++ b/agbenchmark/challenges/memory/m1/m1_test.py @@ -1,25 +1,13 @@ from typing import Any, Dict -import pytest +from agbenchmark.challenge import Challenge -from agbenchmark.challenges.memory.memory import MemoryChallenge - -class TestBasicMemory(MemoryChallenge): +class TestBasicMemory(Challenge): """The first memory challenge""" - @pytest.mark.depends(name="test_basic_memory") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.get_artifacts_out( - config["workspace"], self.data.ground.files - ) - - scores = [] - for file_content in files_contents: - score = self.scoring(file_content, self.data.ground) - print("Your score is:", score) - scores.append(score) - + scores = self.get_scores(config) assert 1 in scores diff --git a/agbenchmark/challenges/memory/m2/data.json b/agbenchmark/challenges/memory/m2/data.json index 6e898298a..998e894b1 100644 --- a/agbenchmark/challenges/memory/m2/data.json +++ b/agbenchmark/challenges/memory/m2/data.json @@ -1,8 +1,7 @@ { - "name": "remember_multiple_ids", "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", - "dependencies": [], + "dependencies": ["TestBasicMemory"], "ground": { "answer": "3145\n3791\n9317\n9471", "should_contain": ["3145", "3791", "9317", "9471"], @@ -10,10 +9,6 @@ "files": [".txt"], "type": "file" }, - "mock": { - "mock_func": "remember_multiple_ids_mock", - "mock_task": "Follow the instructions in the instructions_1.txt file" - }, "info": { "difficulty": "basic", "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", diff --git a/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py b/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py index 6ba38dad3..c88f28831 100644 --- a/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py +++ b/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py @@ -1,27 +1,13 @@ from typing import Any, Dict -import pytest +from agbenchmark.challenge import Challenge -from agbenchmark.challenges.memory.memory import MemoryChallenge - -class TestRememberMultipleIds(MemoryChallenge): +class TestRememberMultipleIds(Challenge): """The first memory challenge""" - @pytest.mark.depends( - name="test_remember_multiple_ids", depends=["test_basic_memory"] - ) def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.get_artifacts_out( - config["workspace"], self.data.ground.files - ) - - scores = [] - for file_content in files_contents: - score = self.scoring(file_content, self.data.ground) - print("Your score is:", score) - scores.append(score) - + scores = self.get_scores(config) assert 1 in scores diff --git a/agbenchmark/challenges/memory/m3/data.json b/agbenchmark/challenges/memory/m3/data.json index 2a4f06ff7..d5d95b1de 100644 --- a/agbenchmark/challenges/memory/m3/data.json +++ b/agbenchmark/challenges/memory/m3/data.json @@ -1,8 +1,7 @@ { - "name": "remember_multiple_ids_with_noise_mock", "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", - "dependencies": [], + "dependencies": ["TestRememberMultipleIds"], "ground": { "answer": "3145\n3791\n9317\n9471", "should_contain": ["3145", "3791", "9317", "9471"], @@ 
-10,10 +9,6 @@ "files": [".txt"], "type": "file" }, - "mock": { - "mock_func": "remember_multiple_ids_mock", - "mock_task": "Follow the instructions in the instructions_1.txt file" - }, "info": { "difficulty": "medium", "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", diff --git a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py index 037a6929e..0e35dd2f4 100644 --- a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py +++ b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py @@ -1,28 +1,13 @@ from typing import Any, Dict -import pytest +from agbenchmark.challenge import Challenge -from agbenchmark.challenges.memory.memory import MemoryChallenge - -class TestRememberMultipleIdsWithNoise(MemoryChallenge): +class TestRememberMultipleIdsWithNoise(Challenge): """The first memory challenge""" - @pytest.mark.depends( - name="test_remember_multiple_ids_with_noise", - depends=["test_remember_multiple_ids"], - ) def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.get_artifacts_out( - config["workspace"], self.data.ground.files - ) - - scores = [] - for file_content in files_contents: - score = self.scoring(file_content, self.data.ground) - print("Your score is:", score) - scores.append(score) - + scores = self.get_scores(config) assert 1 in scores diff --git a/agbenchmark/challenges/memory/m4/data.json b/agbenchmark/challenges/memory/m4/data.json index adfd8e33f..49831537e 100644 --- a/agbenchmark/challenges/memory/m4/data.json +++ b/agbenchmark/challenges/memory/m4/data.json @@ -1,8 +1,7 @@ { - "name": "remember_multiple_phrases_with_noise_mock", "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", - "dependencies": [], + "dependencies": ["TestRememberMultipleIdsWithNoise"], "ground": { "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyoncé on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", "should_contain": [ @@ -15,10 +14,6 @@ "files": [".txt"], "type": "file" }, - "mock": { - "mock_func": "remember_multiple_phrases_with_noise_mock", - "mock_task": "Follow the instructions in the instructions_1.txt file" - }, "info": { "difficulty": "medium", "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", diff --git a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py index 2c931af8c..4c4bdce55 100644 --- a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py +++ b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py @@ -1,28 +1,13 @@ from typing import Any, Dict -import pytest +from agbenchmark.challenge import Challenge -from agbenchmark.challenges.memory.memory import MemoryChallenge - -class TestRememberMultiplePhrasesWithNoise(MemoryChallenge): +class TestRememberMultiplePhrasesWithNoise(Challenge): """The first memory challenge""" - @pytest.mark.depends( - name="test_remember_multiple_phrases_with_noise", - depends=["test_remember_multiple_ids_with_noise"], - ) def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.get_artifacts_out( - config["workspace"], self.data.ground.files - ) - - scores = [] - for file_content in files_contents: - score = self.scoring(file_content, self.data.ground) - print("Your score is:", score) - scores.append(score) - + scores = self.get_scores(config) assert 1 in scores diff --git a/agbenchmark/challenges/memory/memory.py b/agbenchmark/challenges/memory/memory.py deleted file mode 100644 index 429bef23a..000000000 --- a/agbenchmark/challenges/memory/memory.py +++ /dev/null @@ -1,8 +0,0 @@ -import pytest - -from agbenchmark.challenge import Challenge - - -@pytest.mark.memory -class MemoryChallenge(Challenge): - """Challenge for memory""" diff --git a/agbenchmark/challenges/retrieval/r1/data.json b/agbenchmark/challenges/retrieval/r1/data.json index 8fca01b78..6e1344b8b 100644 --- a/agbenchmark/challenges/retrieval/r1/data.json +++ b/agbenchmark/challenges/retrieval/r1/data.json @@ -1,8 +1,7 @@ { - "name": "basic_information_retrieval", "category": ["retrieval"], "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", - "dependencies": [], + "dependencies": ["TestWriteFile"], "ground": { "answer": "£25.89", "should_contain": ["25.89"], @@ -10,10 +9,6 @@ "files": [".txt"], "type": "file" }, - "mock": { - "mock_func": "basic_retrieval_mock", - "mock_task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file." 
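[Every rewritten test above now funnels its output files through `get_scores`, which calls `Challenge.scoring`; the scoring body itself is not part of this diff. A minimal sketch consistent with how `should_contain` and `should_not_contain` are used in the surrounding data.json files, offered as an illustration rather than the project's actual implementation.]

```python
from agbenchmark.challenges.define_task_types import Ground


def scoring_sketch(content: str, ground: Ground) -> float:
    # All should_contain phrases must appear in the produced file...
    if ground.should_contain:
        for phrase in ground.should_contain:
            if phrase not in content:
                return 0.0
    # ...and no should_not_contain phrase may appear.
    if ground.should_not_contain:
        for phrase in ground.should_not_contain:
            if phrase in content:
                return 0.0
    return 1.0
```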
- }, "info": { "difficulty": "basic", "description": "Tests ability to retrieve information from a website.", diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py index 68d3de4e3..9845a7b2a 100644 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ b/agbenchmark/challenges/retrieval/r1/r1_test.py @@ -1,25 +1,13 @@ from typing import Any, Dict -import pytest +from agbenchmark.challenge import Challenge -from agbenchmark.challenges.retrieval.retrieval import RetrievalChallenge - -class TestRetrieval(RetrievalChallenge): +class TestRetrieval(Challenge): """The first information-retrieval challenge""" - @pytest.mark.depends(name="test_retrieval") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.get_artifacts_out( - config["workspace"], self.data.ground.files - ) - - scores = [] - for file_content in files_contents: - score = self.scoring(file_content, self.data.ground) - print("Your score is:", score) - scores.append(score) - + scores = self.get_scores(config) assert 1 in scores diff --git a/agbenchmark/challenges/retrieval/r2/data.json b/agbenchmark/challenges/retrieval/r2/data.json index 3c388f192..05846b9f3 100644 --- a/agbenchmark/challenges/retrieval/r2/data.json +++ b/agbenchmark/challenges/retrieval/r2/data.json @@ -1,8 +1,7 @@ { - "name": "basic_information_retrieval", "category": ["retrieval"], "task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", - "dependencies": [], + "dependencies": ["TestRetrieval"], "ground": { "answer": "81,462", "should_contain": ["81,462"], @@ -10,10 +9,6 @@ "files": [".txt"], "type": "file" }, - "mock": { - "mock_func": "basic_retrieval_2_mock", - "mock_task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)." - }, "info": { "difficulty": "basic", "description": "Tests ability to retrieve information.", diff --git a/agbenchmark/challenges/retrieval/r2/r2_test.py b/agbenchmark/challenges/retrieval/r2/r2_test.py index 5a1a20690..f0f13ffbf 100644 --- a/agbenchmark/challenges/retrieval/r2/r2_test.py +++ b/agbenchmark/challenges/retrieval/r2/r2_test.py @@ -1,25 +1,13 @@ from typing import Any, Dict -import pytest +from agbenchmark.challenge import Challenge -from agbenchmark.challenges.retrieval.retrieval import RetrievalChallenge - -class TestRetrieval2(RetrievalChallenge): +class TestRetrieval2(Challenge): """The first information-retrieval challenge""" - @pytest.mark.depends(on=["test_retrieval"], name="test_retrieval_2") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.get_artifacts_out( - config["workspace"], self.data.ground.files - ) - - scores = [] - for file_content in files_contents: - score = self.scoring(file_content, self.data.ground) - print("Your score is:", score) - scores.append(score) - + scores = self.get_scores(config) assert 1 in scores diff --git a/agbenchmark/challenges/retrieval/r3/data.json b/agbenchmark/challenges/retrieval/r3/data.json index 415456155..763c963ec 100644 --- a/agbenchmark/challenges/retrieval/r3/data.json +++ b/agbenchmark/challenges/retrieval/r3/data.json @@ -1,19 +1,30 @@ { - "name": "basic_information_retrieval", "category": ["retrieval"], "task": "Write tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", - "dependencies": [], + "dependencies": ["TestRetrieval2"], "ground": { "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", - "should_contain": ["15", "112", "117", "204", "413", "2,014", "3,198", "4,046", "7,000", "11,759", "21,461", "24,578", "31,536", "53,823", "81,462"], + "should_contain": [ + "15", + "112", + "117", + "204", + "413", + "2,014", + "3,198", + "4,046", + "7,000", + "11,759", + "21,461", + "24,578", + "31,536", + "53,823", + "81,462" + ], "should_not_contain": [], "files": [".txt"], "type": "file" }, - "mock": { - "mock_func": "basic_retrieval_3_mock", - "mock_task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion)." - }, "info": { "difficulty": "basic", "description": "Tests ability to retrieve information.", diff --git a/agbenchmark/challenges/retrieval/r3/r3_test.py b/agbenchmark/challenges/retrieval/r3/r3_test.py index c4b4bcf12..5887c0b43 100644 --- a/agbenchmark/challenges/retrieval/r3/r3_test.py +++ b/agbenchmark/challenges/retrieval/r3/r3_test.py @@ -1,25 +1,14 @@ from typing import Any, Dict -import pytest +from agbenchmark.challenge import Challenge -from agbenchmark.challenges.retrieval.retrieval import RetrievalChallenge - -class TestRetrieval3(RetrievalChallenge): +class TestRetrieval3(Challenge): """The first information-retrieval challenge""" - @pytest.mark.depends(on=["test_retrieval_2"], name="test_retrieval_3") def test_method(self, config: Dict[str, Any]) -> None: self.setup_challenge(config) - files_contents = self.get_artifacts_out( - config["workspace"], self.data.ground.files - ) - - scores = [] - for file_content in files_contents: - score = self.scoring(file_content, self.data.ground) - print("Your score is:", score) - scores.append(score) + scores = self.get_scores(config) assert 1 in scores diff --git a/agbenchmark/challenges/retrieval/retrieval.py b/agbenchmark/challenges/retrieval/retrieval.py deleted file mode 100644 index 891cccef7..000000000 --- a/agbenchmark/challenges/retrieval/retrieval.py +++ /dev/null @@ -1,8 +0,0 @@ -import pytest - -from agbenchmark.challenge import Challenge - - -@pytest.mark.retrieval -class RetrievalChallenge(Challenge): - """Challenge for information-retrieval""" diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 40457fb67..ffbb26202 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -2,12 +2,16 @@ import json import os import shutil from pathlib import Path # noqa -from typing import Any, Dict, Generator, List +from typing import Any, Dict, Generator import pytest -from agbenchmark.start_benchmark import CONFIG_PATH, REGRESSION_TESTS_PATH -from agbenchmark.tests.regression.RegressionManager import RegressionManager +from agbenchmark.RegressionManager import RegressionManager +from agbenchmark.start_benchmark import ( + CONFIG_PATH, + REGRESSION_TESTS_PATH, + get_regression_data, +) def resolve_workspace(config: Dict[str, Any]) -> str: @@ -37,7 +41,7 @@ def config(request: Any) -> None: config = json.load(f) if request.config.getoption("--mock"): - config["workspace"] = "agbenchmark/mocks/workspace" + config["workspace"] = "agbenchmark/workspace" 
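[The config fixture in the conftest hunks here now accepts either a single workspace string or an input/output dict. As committed, though, both dict branches call `resolve_workspace(config)` on the whole config, whose `workspace` value is a dict, so the `startswith` check inside the resolver would raise; a per-key wrapper along these lines (name hypothetical) appears to be what is intended.]

```python
from typing import Any, Dict

from agbenchmark.conftest import resolve_workspace


def resolve_workspace_entry(config: Dict[str, Any], key: str) -> str:
    # Hypothetical helper: substitute one half of the input/output dict
    # so the string-based resolve_workspace sees the shape it expects.
    sub_config = dict(config)
    sub_config["workspace"] = config["workspace"][key]
    return resolve_workspace(sub_config)


# config["workspace"]["input"] = resolve_workspace_entry(config, "input")
# config["workspace"]["output"] = resolve_workspace_entry(config, "output")
```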
elif isinstance(config["workspace"], str): config["workspace"] = resolve_workspace(config) else: # it's a input output dict @@ -77,9 +81,22 @@ def workspace(config: Dict[str, Any]) -> Generator[str, None, None]: def pytest_addoption(parser: Any) -> None: parser.addoption("--mock", action="store_true", default=False) + parser.addoption("--improve", action="store_true", default=False) + parser.addoption("--maintain", action="store_true", default=False) -regression_manager = RegressionManager(REGRESSION_TESTS_PATH) +@pytest.fixture(autouse=True) +def check_regression(request: Any) -> None: + test_name = request.node.parent.name + data = get_regression_data() + + # Check if the test name exists in the regression tests + if request.config.getoption("--improve") and data.get(test_name, None): + pytest.skip("Skipping test because it's a regression test and --improve is set") + elif request.config.getoption("--maintain") and not data.get(test_name, None): + pytest.skip( + "Skipping test because it's not a regression test and --maintain is set" + ) # this is to get the challenge_data from every test @@ -88,6 +105,9 @@ def challenge_data(request: Any) -> None: return request.param +regression_manager = RegressionManager(REGRESSION_TESTS_PATH) + + def pytest_runtest_makereport(item: Any, call: Any) -> None: if call.when == "call": challenge_data = item.funcargs.get("challenge_data", None) @@ -109,16 +129,6 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None: regression_manager.remove_test(item.nodeid.split("::")[1]) -def pytest_collection_modifyitems(items: List[Any]) -> None: - """Called once all test items are collected. Used - to add regression and depends markers to collected test items.""" - for item in items: - # regression add - if item.nodeid.split("::")[1] in regression_manager.tests: - print(regression_manager.tests) - item.add_marker(pytest.mark.regression) - - def pytest_sessionfinish() -> None: """Called at the end of the session to save regression tests""" regression_manager.save() @@ -135,3 +145,29 @@ def pytest_generate_tests(metafunc: Any) -> None: # Add the parameters to the test function metafunc.parametrize("challenge_data", [params], indirect=True) + + +# this is adding the dependency marker and category markers automatically from the json +def pytest_collection_modifyitems(items: Any, config: Any) -> None: + data = get_regression_data() + + for item in items: + # Assuming item.cls is your test class + test_class_instance = item.cls() + + # Then you can access your properties + name = item.parent.cls.__name__ + dependencies = test_class_instance.data.dependencies + + # Filter dependencies if they exist in regression data if its an improvement test + if config.getoption("--improve"): + dependencies = [dep for dep in dependencies if not data.get(dep, None)] + + categories = test_class_instance.data.category + + # Add depends marker dynamically + item.add_marker(pytest.mark.depends(on=dependencies, name=name)) + + # Add category marker dynamically + for category in categories: + item.add_marker(getattr(pytest.mark, category)) diff --git a/agbenchmark/mocks/mock_manager.py b/agbenchmark/mocks/mock_manager.py deleted file mode 100644 index 3a227e49b..000000000 --- a/agbenchmark/mocks/mock_manager.py +++ /dev/null @@ -1,28 +0,0 @@ -from typing import Any, Dict, Optional - -import agbenchmark.mocks.tests.basic_mocks as basic_mocks - - -class MockManager: - def __init__(self, task: Optional[str], config: Dict[str, Any]) -> None: - self.task = task - self.workspace = 
config["workspace"] - self.modules = [basic_mocks] - - def delegate(self, mock_function_name: Any, *args: Any, **kwargs: Any) -> None: - if hasattr(self, mock_function_name): - # Check if the mock function is an attribute of this class - getattr(self, mock_function_name)(*args, **kwargs) - elif mock_function_name in globals(): - # Check if the function is imported in the file - func = globals()[mock_function_name] - func(self.task, self.workspace, *args, **kwargs) - elif len(self.modules) > 0: - # checks if function is in imported modules - for module in self.modules: - if hasattr(module, mock_function_name): - func = getattr(module, mock_function_name) - func(self.task, self.workspace, *args, **kwargs) - return - else: - raise ValueError(f"No such mock: {mock_function_name}") diff --git a/agbenchmark/mocks/tests/basic_mocks.py b/agbenchmark/mocks/tests/basic_mocks.py deleted file mode 100644 index e4a1dedc0..000000000 --- a/agbenchmark/mocks/tests/basic_mocks.py +++ /dev/null @@ -1,12 +0,0 @@ -from agbenchmark.challenge import Challenge - - -def example_mock(task: str, workspace: str) -> None: - """ - This mock writes to a file (creates one if it doesn't exist) - """ - Challenge.write_to_file( - workspace, - "file_to_check.txt", - "This is an example showing how you can use mocks but here you can use artifacts_out folder instead of a mock.", - ) diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index 295bbf4bf..f78e86a1c 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -2,11 +2,11 @@ import json import os import sys from pathlib import Path -from typing import List +from typing import Any import click import pytest -from dotenv import load_dotenv, set_key +from dotenv import load_dotenv load_dotenv() @@ -26,10 +26,17 @@ def cli() -> None: @cli.command() @click.option("--category", default=None, help="Specific category to run") @click.option("--maintain", is_flag=True, help="Runs only regression tests") +@click.option("--improve", is_flag=True, help="Run only non-regression tests") @click.option("--mock", is_flag=True, help="Run with mock") -def start(category: str, maintain: bool, mock: bool) -> int: +def start(category: str, maintain: bool, improve: bool, mock: bool) -> int: """Start the benchmark tests. If a category flag is provided, run the categories with that mark.""" # Check if configuration file exists and is not empty + if maintain and improve: + print( + "Error: You can't use both --maintain and --improve at the same time. Please choose one." 
+ ) + return 1 + if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0: config = {} @@ -55,7 +62,7 @@ def start(category: str, maintain: bool, mock: bool) -> int: with open(CONFIG_PATH, "r") as f: config = json.load(f) - set_key(".env", "MOCK_TEST", "True" if mock else "False") + os.environ["MOCK_TEST"] = "True" if mock else "False" if not os.path.exists(REGRESSION_TESTS_PATH): with open(REGRESSION_TESTS_PATH, "a"): @@ -65,42 +72,31 @@ def start(category: str, maintain: bool, mock: bool) -> int: for key, value in config.items(): print(f"{key}: {value}") - print("Starting benchmark tests...", category) - tests_to_run = [] pytest_args = ["-vs"] if category: pytest_args.extend(["-m", category]) + print("Starting benchmark tests ", category) else: - if maintain: - print("Running all regression tests") - tests_to_run = get_regression_tests() - else: - print("Running all categories") + print("Running all categories") + + if maintain: + print("Running only regression tests") + pytest_args.append("--maintain") + elif improve: + print("Running only non-regression tests") + pytest_args.append("--improve") if mock: pytest_args.append("--mock") - # Run pytest with the constructed arguments - if not tests_to_run: - tests_to_run = [str(CURRENT_DIRECTORY)] - pytest_args.extend(tests_to_run) - return sys.exit(pytest.main(pytest_args)) -def get_regression_tests() -> List[str]: - if not Path(REGRESSION_TESTS_PATH).exists(): - with open(REGRESSION_TESTS_PATH, "w") as file: - json.dump({}, file) - +def get_regression_data() -> Any: with open(REGRESSION_TESTS_PATH, "r") as file: data = json.load(file) - regression_tests = [ - str(CURRENT_DIRECTORY / ".." / value["test"]) for key, value in data.items() - ] - - return regression_tests + return data if __name__ == "__main__": diff --git a/agbenchmark/tests/basic_abilities/basic_challenge.py b/agbenchmark/tests/basic_abilities/basic_challenge.py deleted file mode 100644 index 8b3a4db1d..000000000 --- a/agbenchmark/tests/basic_abilities/basic_challenge.py +++ /dev/null @@ -1,8 +0,0 @@ -import pytest - -from agbenchmark.challenge import Challenge - - -@pytest.mark.basic -class BasicChallenge(Challenge): - pass diff --git a/agbenchmark/tests/basic_abilities/browse_test.py b/agbenchmark/tests/basic_abilities/browse_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/tests/basic_abilities/read_file/artifacts_in/file_to_check.txt b/agbenchmark/tests/basic_abilities/read_file/artifacts_in/file_to_check.txt deleted file mode 100644 index 980a0d5f1..000000000 --- a/agbenchmark/tests/basic_abilities/read_file/artifacts_in/file_to_check.txt +++ /dev/null @@ -1 +0,0 @@ -Hello World! diff --git a/agbenchmark/tests/basic_abilities/read_file/artifacts_out/file_to_check.txt b/agbenchmark/tests/basic_abilities/read_file/artifacts_out/file_to_check.txt deleted file mode 100644 index c1a7879a1..000000000 --- a/agbenchmark/tests/basic_abilities/read_file/artifacts_out/file_to_check.txt +++ /dev/null @@ -1 +0,0 @@ -random string Hello World! 
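[A trace of how `start` now assembles its pytest invocation (flag values illustrative). Setting `MOCK_TEST` through `os.environ` instead of `set_key(".env", ...)` keeps the flag scoped to this process and its children rather than persisting it to disk between runs.]

```python
import os

category, maintain, improve, mock = "memory", False, True, True

os.environ["MOCK_TEST"] = "True" if mock else "False"  # read by agent_interface

pytest_args = ["-vs"]
if category:
    pytest_args.extend(["-m", category])
if maintain:
    pytest_args.append("--maintain")
elif improve:
    pytest_args.append("--improve")
if mock:
    pytest_args.append("--mock")

print(pytest_args)  # ['-vs', '-m', 'memory', '--improve', '--mock']
```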
diff --git a/agbenchmark/tests/basic_abilities/read_file/data.json b/agbenchmark/tests/basic_abilities/read_file/data.json deleted file mode 100644 index 7463d22fc..000000000 --- a/agbenchmark/tests/basic_abilities/read_file/data.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "name": "basic_read_file", - "category": ["basic"], - "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", - "dependencies": ["basic_write_file"], - "ground": { - "answer": "random string Hello World!", - "should_contain": ["random string", "Hello World!"], - "files": ["file_to_check.txt"], - "type": "file" - }, - "mock": { - "mock_func": "basic_read_file_mock" - }, - "info": { - "description": "This reads the file quickly", - "difficulty": "basic", - "side_effects": [""] - } -} diff --git a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py b/agbenchmark/tests/basic_abilities/read_file/read_file_test.py deleted file mode 100644 index cf5dceb69..000000000 --- a/agbenchmark/tests/basic_abilities/read_file/read_file_test.py +++ /dev/null @@ -1,24 +0,0 @@ -from typing import Any, Dict - -import pytest - -from agbenchmark.tests.basic_abilities.basic_challenge import BasicChallenge - - -class TestReadFile(BasicChallenge): - """Testing if LLM can read a file""" - - @pytest.mark.depends(on=["basic_write_file"], name="basic_read_file") - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - files_contents = self.get_artifacts_out( - config["workspace"], self.data.ground.files - ) - - scores = [] - for file_content in files_contents: - score = self.scoring(file_content, self.data.ground) - print("Your score is:", score) - scores.append(score) - - assert 1 in scores diff --git a/agbenchmark/tests/basic_abilities/remember_context_test.py b/agbenchmark/tests/basic_abilities/remember_context_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/tests/basic_abilities/write_file/artifacts_out/random_file.txt b/agbenchmark/tests/basic_abilities/write_file/artifacts_out/random_file.txt deleted file mode 100644 index 1f275fb98..000000000 --- a/agbenchmark/tests/basic_abilities/write_file/artifacts_out/random_file.txt +++ /dev/null @@ -1 +0,0 @@ -Washington diff --git a/agbenchmark/tests/basic_abilities/write_file/data.json b/agbenchmark/tests/basic_abilities/write_file/data.json deleted file mode 100644 index 9232a45a0..000000000 --- a/agbenchmark/tests/basic_abilities/write_file/data.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "name": "basic_write_file", - "category": ["basic"], - "task": "Print the the capital of America to a .txt file", - "dependencies": [], - "ground": { - "answer": "Washington", - "should_contain": ["Washington"], - "should_not_contain": ["New York", "Los Angeles", "San Francisco"], - "files": [".txt"], - "type": "file" - }, - "mock": { - "mock_func": "basic_write_file_mock", - "mock_task": "What is the capital of America?" 
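[The deleted files above show the convention this patch retires: dependencies were duplicated as hand-written marker names in both data.json (`"basic_write_file"`) and a `@pytest.mark.depends` decorator. Under the new scheme, data.json lists test class names and the conftest collection hook attaches the markers. A sketch of the per-item step that hook performs.]

```python
from typing import Any, List

import pytest


def add_challenge_markers(
    item: Any, name: str, dependencies: List[str], categories: List[str]
) -> None:
    # What pytest_collection_modifyitems now does for each collected test:
    # markers come from data.json (class names such as "TestWriteFile")
    # instead of hand-written decorators on every test_method.
    item.add_marker(pytest.mark.depends(on=dependencies, name=name))
    for category in categories:
        item.add_marker(getattr(pytest.mark, category))
```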
- }, - "info": { - "difficulty": "basic", - "description": "Tests the writing to file", - "side_effects": ["tests if there is in fact an LLM attached"] - } -} diff --git a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py b/agbenchmark/tests/basic_abilities/write_file/write_file_test.py deleted file mode 100644 index ba0395186..000000000 --- a/agbenchmark/tests/basic_abilities/write_file/write_file_test.py +++ /dev/null @@ -1,25 +0,0 @@ -from typing import Any, Dict - -import pytest - -from agbenchmark.tests.basic_abilities.basic_challenge import BasicChallenge - - -class TestWriteFile(BasicChallenge): - """Testing if LLM can write to a file""" - - @pytest.mark.depends(name="basic_write_file") - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - - files_contents = self.get_artifacts_out( - config["workspace"], self.data.ground.files - ) - - scores = [] - for file_content in files_contents: - score = self.scoring(file_content, self.data.ground) - print("Your score is:", score) - scores.append(score) - - assert 1 in scores diff --git a/agbenchmark/tests/regression/RegressionManager.py b/agbenchmark/tests/regression/RegressionManager.py deleted file mode 100644 index a1379ecae..000000000 --- a/agbenchmark/tests/regression/RegressionManager.py +++ /dev/null @@ -1,29 +0,0 @@ -import json - - -class RegressionManager: - """Abstracts interaction with the regression tests file""" - - def __init__(self, filename: str): - self.filename = filename - self.load() - - def load(self) -> None: - try: - with open(self.filename, "r") as f: - self.tests = json.load(f) - except (FileNotFoundError, json.decoder.JSONDecodeError): - self.tests = {} - - def save(self) -> None: - with open(self.filename, "w") as f: - json.dump(self.tests, f, indent=4) - - def add_test(self, test_name: str, test_details: dict) -> None: - self.tests[test_name] = test_details - self.save() - - def remove_test(self, test_name: str) -> None: - if test_name in self.tests: - del self.tests[test_name] - self.save() diff --git a/agent/Auto-GPT b/agent/Auto-GPT index dd65cc256..5a36e43b7 160000 --- a/agent/Auto-GPT +++ b/agent/Auto-GPT @@ -1 +1 @@ -Subproject commit dd65cc256ca72cb199fe8c5d6ae31c23a7acee62 +Subproject commit 5a36e43b782fdaef8a7270109f8347f0323211d2 diff --git a/agent/gpt-engineer b/agent/gpt-engineer index 155ea895e..fd705f89a 160000 --- a/agent/gpt-engineer +++ b/agent/gpt-engineer @@ -1 +1 @@ -Subproject commit 155ea895eb5f7e44ed8647b335d90a03b5ffb06d +Subproject commit fd705f89afd53469c91935a9cae7b92a564025eb diff --git a/agent/smol-developer b/agent/smol-developer index 5a3ad4310..a1e4a9ff3 160000 --- a/agent/smol-developer +++ b/agent/smol-developer @@ -1 +1 @@ -Subproject commit 5a3ad43103b238b9c8f2a2acceff250888be263e +Subproject commit a1e4a9ff3a75909c4a892e409a55f86a2c57b7c6 diff --git a/config.json b/config.json index 88526a134..8bbcebdbd 100644 --- a/config.json +++ b/config.json @@ -1,6 +1,6 @@ { - "workspace": "projects/my-new-project/workspace", - "entry_path": "agent/gpt-engineer/benchmarks.py", - "home_path": "agent/gpt-engineer", + "workspace": "${os.path.join(Path.home(), 'miniagi')}", + "entry_path": "benchmarks.py", + "home_path": "agent/mini-agi", "cutoff": 60 } diff --git a/pyproject.toml b/pyproject.toml index 33a8671cf..a8f4f8dee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ testpaths = [ markers = [ "retrieval", "regression", - "basic", + "interface", "code", "memory" ] diff --git a/regression_tests.json b/regression_tests.json 
index 9714d42a8..44334801e 100644 --- a/regression_tests.json +++ b/regression_tests.json @@ -1,14 +1,4 @@ { - "TestDebugSimpleTypoWithGuidance": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py" - }, - "TestDebugSimpleTypoWithoutGuidance": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/code/d2/d2_test.py" - }, "TestBasicMemory": { "difficulty": "basic", "dependencies": [], @@ -16,44 +6,54 @@ }, "TestRememberMultipleIds": { "difficulty": "basic", - "dependencies": [], + "dependencies": [ + "TestBasicMemory" + ], "test": "agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py" }, "TestRememberMultipleIdsWithNoise": { "difficulty": "medium", - "dependencies": [], + "dependencies": [ + "TestRememberMultipleIds" + ], "test": "agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py" }, + "TestRememberMultiplePhrasesWithNoise": { + "difficulty": "medium", + "dependencies": [ + "TestRememberMultipleIdsWithNoise" + ], + "test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py" + }, "TestRetrieval": { "difficulty": "basic", "dependencies": [], "test": "agbenchmark/challenges/retrieval/r1/r1_test.py" }, - "TestWriteFile": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/tests/basic_abilities/write_file/write_file_test.py" - }, "TestRetrieval2": { "difficulty": "basic", - "dependencies": [], + "dependencies": [ + "TestRetrieval" + ], "test": "agbenchmark/challenges/retrieval/r2/r2_test.py" }, - "TestReadFile": { + "TestRetrieval3": { "difficulty": "basic", "dependencies": [ - "basic_write_file" + "TestRetrieval2" ], - "test": "agbenchmark/tests/basic_abilities/read_file/read_file_test.py" + "test": "agbenchmark/challenges/retrieval/r3/r3_test.py" }, - "TestRetrieval3": { + "TestWriteFile": { "difficulty": "basic", "dependencies": [], - "test": "agbenchmark/challenges/retrieval/r3/r3_test.py" + "test": "agbenchmark/challenges/interface/write_file/write_file_test.py" }, - "TestRememberMultiplePhrasesWithNoise": { - "difficulty": "medium", - "dependencies": [], - "test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py" + "TestReadFile": { + "difficulty": "basic", + "dependencies": [ + "TestWriteFile" + ], + "test": "agbenchmark/challenges/interface/read_file/read_file_test.py" } } \ No newline at end of file -- cgit v1.2.3 From d89264998d36251d8c471942da05b557fa26689d Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Sat, 8 Jul 2023 18:46:37 -0700 Subject: Fix debug code challenge (#76) Co-authored-by: Silen Naihin --- agbenchmark/conftest.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index ffbb26202..7d3dd8ed3 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -40,9 +40,7 @@ def config(request: Any) -> None: with open(CONFIG_PATH, "r") as f: config = json.load(f) - if request.config.getoption("--mock"): - config["workspace"] = "agbenchmark/workspace" - elif isinstance(config["workspace"], str): + if isinstance(config["workspace"], str): config["workspace"] = resolve_workspace(config) else: # it's a input output dict config["workspace"]["input"] = resolve_workspace(config) -- cgit v1.2.3 From 573130549fec6fe86194dec6cd9a2257dc5c5eec Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Sun, 9 Jul 2023 13:31:31 -0700 Subject: Add gpt engineer to ci (#78) --- 
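The conftest.py change in the previous patch also keeps a second branch for the case where "workspace" is an input/output mapping rather than a string. That shape never appears in these patches, but a config.json using it would presumably look something like the following (the paths are illustrative only):

    {
      "workspace": {
        "input": "${os.path.join(Path.home(), 'benchmark_workspace', 'input')}",
        "output": "${os.path.join(Path.home(), 'benchmark_workspace', 'output')}"
      },
      "entry_path": "benchmarks.py",
      "home_path": "agent/mini-agi",
      "cutoff": 60
    }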
.github/workflows/gpt-engineer.yml | 24 +++++++++++++++++++++--- agbenchmark/start_benchmark.py | 3 +++ agent/Auto-GPT | 2 +- agent/gpt-engineer | 2 +- 4 files changed, 26 insertions(+), 5 deletions(-) diff --git a/.github/workflows/gpt-engineer.yml b/.github/workflows/gpt-engineer.yml index a39165482..833026e8e 100644 --- a/.github/workflows/gpt-engineer.yml +++ b/.github/workflows/gpt-engineer.yml @@ -5,6 +5,8 @@ on: branches: [master] push: branches: [stable, master, ci-test*] + pull_request: + branches: [stable, master, ci-test*] jobs: regression-tests: @@ -52,14 +54,30 @@ jobs: poetry install --only main poetry build - - name: Run regression tests + - name: Run regression tests (push) + if: ${{ github.event_name != 'pull_request' }} run: | cd agent/gpt-engineer make install source venv/bin/activate - pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl - agbenchmark start --maintain + pip install ../../dist/*.whl + + if [ "${GITHUB_EVENT_NAME}" != "pull_request" ]; then + agbenchmark start --maintain + else + agbenchmark start --maintain --mock + agbenchmark start --improve --mock + agbenchmark start --mock + agbenchmark start --mock --category=retrieval + agbenchmark start --mock --category=regression + agbenchmark start --mock --category=interface + agbenchmark start --mock --category=code + agbenchmark start --mock --category=memory + agbenchmark start --mock --category=memory --category=code + fi + env: + GITHUB_EVENT_NAME: ${{ github.event_name }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - name: Upload logs as artifact diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index f78e86a1c..68c7932be 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -89,6 +89,9 @@ def start(category: str, maintain: bool, improve: bool, mock: bool) -> int: if mock: pytest_args.append("--mock") + # when used as a library, the pytest directory to execute is in the CURRENT_DIRECTORY + pytest_args.append(str(CURRENT_DIRECTORY)) + return sys.exit(pytest.main(pytest_args)) diff --git a/agent/Auto-GPT b/agent/Auto-GPT index 5a36e43b7..cec424ad2 160000 --- a/agent/Auto-GPT +++ b/agent/Auto-GPT @@ -1 +1 @@ -Subproject commit 5a36e43b782fdaef8a7270109f8347f0323211d2 +Subproject commit cec424ad2504020a830c3af9f74536a420545931 diff --git a/agent/gpt-engineer b/agent/gpt-engineer index fd705f89a..4af8c137e 160000 --- a/agent/gpt-engineer +++ b/agent/gpt-engineer @@ -1 +1 @@ -Subproject commit fd705f89afd53469c91935a9cae7b92a564025eb +Subproject commit 4af8c137e82cc51fdd31c23327ceffd64194b984 -- cgit v1.2.3 From 3d43117554034a634f1c39018c6af6c69ed16fc9 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 9 Jul 2023 20:27:21 -0400 Subject: Just json, no test files (#77) --- agbenchmark/RegressionManager.py | 13 +++- agbenchmark/challenge.py | 26 ++------ agbenchmark/challenges/code/d1/data.json | 1 + .../d1/debug_simple_typo_with_guidance_test.py | 13 ---- agbenchmark/challenges/code/d2/d2_test.py | 14 ---- agbenchmark/challenges/code/d2/data.json | 1 + agbenchmark/challenges/define_task_types.py | 1 + .../challenges/interface/read_file/data.json | 2 +- .../interface/read_file/read_file_test.py | 12 ---- .../challenges/interface/write_file/data.json | 2 +- .../interface/write_file/write_file_test.py | 13 ---- agbenchmark/challenges/memory/m1/data.json | 1 + agbenchmark/challenges/memory/m1/m1_test.py | 13 ---- agbenchmark/challenges/memory/m2/data.json | 1 + .../memory/m2/remember_multiple_ids_test.py | 13 ---- 
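The substance of this patch is that every hand-written *_test.py is deleted and each challenge is defined solely by its data.json, validated through the pydantic models in define_task_types.py and instantiated as a test at collection time. A condensed sketch of the schema those files must now satisfy; the field list mirrors the data.json files in this series, so treat it as inferred rather than copied from the real module:

    from typing import List

    from pydantic import BaseModel


    class Ground(BaseModel):
        answer: str
        should_contain: List[str]
        should_not_contain: List[str]
        files: List[str]
        type: str  # "file" here; "custom_python" appears later in this series


    class Info(BaseModel):
        difficulty: str
        description: str
        side_effects: List[str]


    class ChallengeData(BaseModel):
        name: str  # newly required; doubles as the generated test class name
        category: List[str]
        task: str
        dependencies: List[str]
        ground: Ground
        info: Info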
agbenchmark/challenges/memory/m3/data.json | 1 + .../m3/remember_multiple_ids_with_noise_test.py | 13 ---- agbenchmark/challenges/memory/m4/data.json | 1 + .../remember_multiple_phrases_with_noise_test.py | 13 ---- agbenchmark/challenges/retrieval/r1/data.json | 1 + agbenchmark/challenges/retrieval/r1/r1_test.py | 13 ---- agbenchmark/challenges/retrieval/r2/data.json | 3 +- agbenchmark/challenges/retrieval/r2/r2_test.py | 13 ---- agbenchmark/challenges/retrieval/r3/data.json | 1 + agbenchmark/challenges/retrieval/r3/r3_test.py | 14 ---- agbenchmark/challenges/test_all.py | 78 ++++++++++++++++++++++ agbenchmark/conftest.py | 40 +++++------ regression_tests.json | 61 +++++++++-------- 28 files changed, 158 insertions(+), 220 deletions(-) delete mode 100644 agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py delete mode 100644 agbenchmark/challenges/code/d2/d2_test.py delete mode 100644 agbenchmark/challenges/interface/read_file/read_file_test.py delete mode 100644 agbenchmark/challenges/interface/write_file/write_file_test.py delete mode 100644 agbenchmark/challenges/memory/m1/m1_test.py delete mode 100644 agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py delete mode 100644 agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py delete mode 100644 agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py delete mode 100644 agbenchmark/challenges/retrieval/r1/r1_test.py delete mode 100644 agbenchmark/challenges/retrieval/r2/r2_test.py delete mode 100644 agbenchmark/challenges/retrieval/r3/r3_test.py create mode 100644 agbenchmark/challenges/test_all.py diff --git a/agbenchmark/RegressionManager.py b/agbenchmark/RegressionManager.py index a1379ecae..e289a4787 100644 --- a/agbenchmark/RegressionManager.py +++ b/agbenchmark/RegressionManager.py @@ -11,9 +11,18 @@ class RegressionManager: def load(self) -> None: try: with open(self.filename, "r") as f: - self.tests = json.load(f) - except (FileNotFoundError, json.decoder.JSONDecodeError): + file_content = ( + f.read().strip() + ) # read the content and remove any leading/trailing whitespace + if file_content: # if file is not empty, load the json + self.tests = json.loads(file_content) + else: # if file is empty, assign an empty dictionary + self.tests = {} + except FileNotFoundError: self.tests = {} + except json.decoder.JSONDecodeError: # If JSON is invalid + self.tests = {} + self.save() def save(self) -> None: with open(self.filename, "w") as f: diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py index ddf69f42d..cf7ce104c 100644 --- a/agbenchmark/challenge.py +++ b/agbenchmark/challenge.py @@ -1,10 +1,8 @@ import glob -import inspect import os import subprocess -import types -from abc import ABC, ABCMeta -from typing import Any, Dict, List, Tuple, Type, cast +from abc import ABC +from typing import Any, Dict, List from dotenv import load_dotenv @@ -16,24 +14,12 @@ mock_test_str = os.getenv("MOCK_TEST") MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str else False -class ChallengeMeta(ABCMeta): - def __init__(self, name: str, bases: Tuple[Type, ...], dct: Dict[str, Any]) -> None: - super().__init__(name, bases, dct) - try: - frame = cast(types.FrameType, inspect.currentframe()) - assert frame.f_back is not None - self.CHALLENGE_LOCATION = os.path.dirname(inspect.getfile(frame.f_back)) - except Exception as e: - print(f"Unable to get the file from 8 frames back due to: {str(e)}") - raise e - - -class Challenge(ABC, metaclass=ChallengeMeta): +class 
Challenge(ABC): """The parent class to all specific challenges classes. Defines helper methods for running a challenge""" _data_cache: Dict[str, ChallengeData] = {} - CHALLENGE_LOCATION: str + CHALLENGE_LOCATION: str = "" @property def data(self) -> ChallengeData: @@ -54,10 +40,10 @@ class Challenge(ABC, metaclass=ChallengeMeta): from agbenchmark.agent_interface import copy_artifacts_into_workspace, run_agent copy_artifacts_into_workspace( - config["workspace"], "artifacts_in", self.__class__.CHALLENGE_LOCATION + config["workspace"], "artifacts_in", self.CHALLENGE_LOCATION ) - run_agent(self.task, config, self.__class__.CHALLENGE_LOCATION) + run_agent(self.task, config, self.CHALLENGE_LOCATION) def test_method(self, config: Dict[str, Any]) -> None: raise NotImplementedError diff --git a/agbenchmark/challenges/code/d1/data.json b/agbenchmark/challenges/code/d1/data.json index 6ac284b81..0c7246000 100644 --- a/agbenchmark/challenges/code/d1/data.json +++ b/agbenchmark/challenges/code/d1/data.json @@ -1,4 +1,5 @@ { + "name": "TestDebugSimpleTypoWithGuidance", "category": ["code"], "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", "dependencies": ["TestReadFile", "TestWriteFile"], diff --git a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py b/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py deleted file mode 100644 index d104b3374..000000000 --- a/agbenchmark/challenges/code/d1/debug_simple_typo_with_guidance_test.py +++ /dev/null @@ -1,13 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge import Challenge - - -class TestDebugSimpleTypoWithGuidance(Challenge): - """The first memory challenge""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - - scores = self.get_scores(config) - assert 1 in scores diff --git a/agbenchmark/challenges/code/d2/d2_test.py b/agbenchmark/challenges/code/d2/d2_test.py deleted file mode 100644 index b02114a75..000000000 --- a/agbenchmark/challenges/code/d2/d2_test.py +++ /dev/null @@ -1,14 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge import Challenge - - -class TestDebugSimpleTypoWithoutGuidance(Challenge): - """The first memory challenge""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - - scores = self.get_scores(config) - - assert 1 in scores diff --git a/agbenchmark/challenges/code/d2/data.json b/agbenchmark/challenges/code/d2/data.json index 3de5111f5..292301094 100644 --- a/agbenchmark/challenges/code/d2/data.json +++ b/agbenchmark/challenges/code/d2/data.json @@ -1,4 +1,5 @@ { + "name": "TestDebugSimpleTypoWithoutGuidance", "category": ["code"], "task": "Make test.py run without errors.", "dependencies": ["TestDebugSimpleTypoWithGuidance"], diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index 308cb5ea6..94cba5b72 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -19,6 +19,7 @@ class Ground(BaseModel): class ChallengeData(BaseModel): + name: str category: List[str] task: str dependencies: List[str] diff --git a/agbenchmark/challenges/interface/read_file/data.json b/agbenchmark/challenges/interface/read_file/data.json index dd399fabf..c827581b6 100644 --- a/agbenchmark/challenges/interface/read_file/data.json +++ b/agbenchmark/challenges/interface/read_file/data.json @@ -1,5 +1,5 @@ { - "name": "ReadFile", 
+ "name": "TestReadFile", "category": ["interface"], "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", "dependencies": ["TestWriteFile"], diff --git a/agbenchmark/challenges/interface/read_file/read_file_test.py b/agbenchmark/challenges/interface/read_file/read_file_test.py deleted file mode 100644 index 591d0a744..000000000 --- a/agbenchmark/challenges/interface/read_file/read_file_test.py +++ /dev/null @@ -1,12 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge import Challenge - - -class TestReadFile(Challenge): - """Testing if LLM can read a file""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - scores = self.get_scores(config) - assert 1 in scores diff --git a/agbenchmark/challenges/interface/write_file/data.json b/agbenchmark/challenges/interface/write_file/data.json index b3e4b6f02..2be2d0dfe 100644 --- a/agbenchmark/challenges/interface/write_file/data.json +++ b/agbenchmark/challenges/interface/write_file/data.json @@ -1,5 +1,5 @@ { - "name": "WriteFile", + "name": "TestWriteFile", "category": ["interface"], "task": "Print the the capital of America to a .txt file", "dependencies": [], diff --git a/agbenchmark/challenges/interface/write_file/write_file_test.py b/agbenchmark/challenges/interface/write_file/write_file_test.py deleted file mode 100644 index 4a52b0979..000000000 --- a/agbenchmark/challenges/interface/write_file/write_file_test.py +++ /dev/null @@ -1,13 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge import Challenge - - -class TestWriteFile(Challenge): - """Testing if LLM can write to a file""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - - scores = self.get_scores(config) - assert 1 in scores diff --git a/agbenchmark/challenges/memory/m1/data.json b/agbenchmark/challenges/memory/m1/data.json index f771a2669..506b246ad 100644 --- a/agbenchmark/challenges/memory/m1/data.json +++ b/agbenchmark/challenges/memory/m1/data.json @@ -1,4 +1,5 @@ { + "name": "TestBasicMemory", "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", "dependencies": ["TestReadFile", "TestWriteFile"], diff --git a/agbenchmark/challenges/memory/m1/m1_test.py b/agbenchmark/challenges/memory/m1/m1_test.py deleted file mode 100644 index 0fc537eeb..000000000 --- a/agbenchmark/challenges/memory/m1/m1_test.py +++ /dev/null @@ -1,13 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge import Challenge - - -class TestBasicMemory(Challenge): - """The first memory challenge""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - - scores = self.get_scores(config) - assert 1 in scores diff --git a/agbenchmark/challenges/memory/m2/data.json b/agbenchmark/challenges/memory/m2/data.json index 998e894b1..7ef2552d1 100644 --- a/agbenchmark/challenges/memory/m2/data.json +++ b/agbenchmark/challenges/memory/m2/data.json @@ -1,4 +1,5 @@ { + "name": "TestRememberMultipleIds", "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", "dependencies": ["TestBasicMemory"], diff --git a/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py b/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py deleted file mode 100644 index c88f28831..000000000 --- a/agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py +++ /dev/null @@ -1,13 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge 
import Challenge - - -class TestRememberMultipleIds(Challenge): - """The first memory challenge""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - - scores = self.get_scores(config) - assert 1 in scores diff --git a/agbenchmark/challenges/memory/m3/data.json b/agbenchmark/challenges/memory/m3/data.json index d5d95b1de..720cce93c 100644 --- a/agbenchmark/challenges/memory/m3/data.json +++ b/agbenchmark/challenges/memory/m3/data.json @@ -1,4 +1,5 @@ { + "name": "TestRememberMultipleIdsWithNoise", "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", "dependencies": ["TestRememberMultipleIds"], diff --git a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py b/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py deleted file mode 100644 index 0e35dd2f4..000000000 --- a/agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py +++ /dev/null @@ -1,13 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge import Challenge - - -class TestRememberMultipleIdsWithNoise(Challenge): - """The first memory challenge""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - - scores = self.get_scores(config) - assert 1 in scores diff --git a/agbenchmark/challenges/memory/m4/data.json b/agbenchmark/challenges/memory/m4/data.json index 49831537e..61965206b 100644 --- a/agbenchmark/challenges/memory/m4/data.json +++ b/agbenchmark/challenges/memory/m4/data.json @@ -1,4 +1,5 @@ { + "name": "TestRememberMultiplePhrasesWithNoise", "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", "dependencies": ["TestRememberMultipleIdsWithNoise"], diff --git a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py b/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py deleted file mode 100644 index 4c4bdce55..000000000 --- a/agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py +++ /dev/null @@ -1,13 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge import Challenge - - -class TestRememberMultiplePhrasesWithNoise(Challenge): - """The first memory challenge""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - - scores = self.get_scores(config) - assert 1 in scores diff --git a/agbenchmark/challenges/retrieval/r1/data.json b/agbenchmark/challenges/retrieval/r1/data.json index 6e1344b8b..7812c21da 100644 --- a/agbenchmark/challenges/retrieval/r1/data.json +++ b/agbenchmark/challenges/retrieval/r1/data.json @@ -1,4 +1,5 @@ { + "name": "TestBasicRetrieval", "category": ["retrieval"], "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", "dependencies": ["TestWriteFile"], diff --git a/agbenchmark/challenges/retrieval/r1/r1_test.py b/agbenchmark/challenges/retrieval/r1/r1_test.py deleted file mode 100644 index 9845a7b2a..000000000 --- a/agbenchmark/challenges/retrieval/r1/r1_test.py +++ /dev/null @@ -1,13 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge import Challenge - - -class TestRetrieval(Challenge): - """The first information-retrieval challenge""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - - scores = self.get_scores(config) - assert 1 in scores diff --git a/agbenchmark/challenges/retrieval/r2/data.json 
b/agbenchmark/challenges/retrieval/r2/data.json index 05846b9f3..5bc2e96b4 100644 --- a/agbenchmark/challenges/retrieval/r2/data.json +++ b/agbenchmark/challenges/retrieval/r2/data.json @@ -1,7 +1,8 @@ { + "name": "TestRetrieval2", "category": ["retrieval"], "task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", - "dependencies": ["TestRetrieval"], + "dependencies": ["TestBasicRetrieval"], "ground": { "answer": "81,462", "should_contain": ["81,462"], diff --git a/agbenchmark/challenges/retrieval/r2/r2_test.py b/agbenchmark/challenges/retrieval/r2/r2_test.py deleted file mode 100644 index f0f13ffbf..000000000 --- a/agbenchmark/challenges/retrieval/r2/r2_test.py +++ /dev/null @@ -1,13 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge import Challenge - - -class TestRetrieval2(Challenge): - """The first information-retrieval challenge""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - - scores = self.get_scores(config) - assert 1 in scores diff --git a/agbenchmark/challenges/retrieval/r3/data.json b/agbenchmark/challenges/retrieval/r3/data.json index 763c963ec..b918d3d4e 100644 --- a/agbenchmark/challenges/retrieval/r3/data.json +++ b/agbenchmark/challenges/retrieval/r3/data.json @@ -1,4 +1,5 @@ { + "name": "TestRetrieval3", "category": ["retrieval"], "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", "dependencies": ["TestRetrieval2"], diff --git a/agbenchmark/challenges/retrieval/r3/r3_test.py b/agbenchmark/challenges/retrieval/r3/r3_test.py deleted file mode 100644 index 5887c0b43..000000000 --- a/agbenchmark/challenges/retrieval/r3/r3_test.py +++ /dev/null @@ -1,14 +0,0 @@ -from typing import Any, Dict - -from agbenchmark.challenge import Challenge - - -class TestRetrieval3(Challenge): - """The first information-retrieval challenge""" - - def test_method(self, config: Dict[str, Any]) -> None: - self.setup_challenge(config) - - scores = self.get_scores(config) - - assert 1 in scores diff --git a/agbenchmark/challenges/test_all.py b/agbenchmark/challenges/test_all.py new file mode 100644 index 000000000..4f9e5b7f8 --- /dev/null +++ b/agbenchmark/challenges/test_all.py @@ -0,0 +1,78 @@ +import glob +import importlib +import json +import os +import types +from pathlib import Path +from typing import Any, Dict + +import pytest +from dotenv import load_dotenv + +from agbenchmark.challenge import Challenge + +load_dotenv() + +IMPROVE = os.getenv("IMPROVE", "False") + + +json_files = glob.glob("agbenchmark/challenges/**/data.json", recursive=True) + + +def get_test_path(json_file: str) -> str: + abs_location = os.path.dirname(os.path.abspath(json_file)) + + path = Path(abs_location) + + # Find the index of "agbenchmark" in the path parts + try: + agbenchmark_index = path.parts.index("agbenchmark") + except ValueError: + raise ValueError("Invalid challenge location.") + + # Create the path from "agbenchmark" onwards + challenge_location = Path(*path.parts[agbenchmark_index:]) + + return str(challenge_location) + + +def generate_tests() -> None: + print("Generating tests...") + # Dynamic class creation + for json_file in json_files: + with open(json_file, "r") as f: + data = json.load(f) + + class_name = data.get("name", "") + + challenge_location = get_test_path(json_file) + + # Define 
test class dynamically + challenge_class = types.new_class(class_name, (Challenge,)) + + setattr(challenge_class, "CHALLENGE_LOCATION", challenge_location) + + # Define test method within the dynamically created class + def test_method(self, config: Dict[str, Any]) -> None: # type: ignore + self.setup_challenge(config) + + scores = self.get_scores(config) + assert 1 in scores + + # Parametrize the method here + test_method = pytest.mark.parametrize( + "challenge_data", + [data], + indirect=True, + )(test_method) + + setattr(challenge_class, "test_method", test_method) + + # Attach the new class to a module so it can be discovered by pytest + module = importlib.import_module(__name__) + setattr(module, class_name, challenge_class) + + print(f"Generated test for {class_name}.") + + +generate_tests() diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 7d3dd8ed3..e321f5a26 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -88,13 +88,16 @@ def check_regression(request: Any) -> None: test_name = request.node.parent.name data = get_regression_data() + # Get the true location of the test + challenge_location = getattr(request.node.parent.cls, "CHALLENGE_LOCATION", "") + + skip_string = f"Skipping {test_name} at {challenge_location}" + # Check if the test name exists in the regression tests if request.config.getoption("--improve") and data.get(test_name, None): - pytest.skip("Skipping test because it's a regression test and --improve is set") + pytest.skip(f"{skip_string} because it's a regression test") elif request.config.getoption("--maintain") and not data.get(test_name, None): - pytest.skip( - "Skipping test because it's not a regression test and --maintain is set" - ) + pytest.skip(f"{skip_string} because it's not a regression test") # this is to get the challenge_data from every test @@ -109,15 +112,19 @@ regression_manager = RegressionManager(REGRESSION_TESTS_PATH) def pytest_runtest_makereport(item: Any, call: Any) -> None: if call.when == "call": challenge_data = item.funcargs.get("challenge_data", None) - difficulty = challenge_data.info.difficulty if challenge_data else "unknown" - dependencies = challenge_data.dependencies if challenge_data else [] - parts = item.nodeid.split("::")[0].split("/") - agbenchmark_index = parts.index("agbenchmark") - file_path = "/".join(parts[agbenchmark_index:]) + difficulty = ( + challenge_data["info"]["difficulty"] if challenge_data else "unknown" + ) + dependencies = dependencies = ( + challenge_data["dependencies"] if challenge_data else [] + ) + # Extract the challenge_location from the class + challenge_location: str = getattr(item.cls, "CHALLENGE_LOCATION", "") + test_details = { "difficulty": difficulty, "dependencies": dependencies, - "test": file_path, + "test": challenge_location, } print("pytest_runtest_makereport", test_details) @@ -132,19 +139,6 @@ def pytest_sessionfinish() -> None: regression_manager.save() -# this is so that all tests can inherit from the Challenge class -def pytest_generate_tests(metafunc: Any) -> None: - if "challenge_data" in metafunc.fixturenames: - # Get the instance of the test class - test_class = metafunc.cls() - - # Generate the parameters - params = test_class.data - - # Add the parameters to the test function - metafunc.parametrize("challenge_data", [params], indirect=True) - - # this is adding the dependency marker and category markers automatically from the json def pytest_collection_modifyitems(items: Any, config: Any) -> None: data = get_regression_data() diff --git 
a/regression_tests.json b/regression_tests.json index 44334801e..613207917 100644 --- a/regression_tests.json +++ b/regression_tests.json @@ -1,59 +1,64 @@ { - "TestBasicMemory": { + "TestWriteFile": { "difficulty": "basic", "dependencies": [], - "test": "agbenchmark/challenges/memory/m1/m1_test.py" + "test": "agbenchmark\\challenges\\interface\\write_file" }, - "TestRememberMultipleIds": { + "TestReadFile": { "difficulty": "basic", "dependencies": [ - "TestBasicMemory" + "TestWriteFile" ], - "test": "agbenchmark/challenges/memory/m2/remember_multiple_ids_test.py" + "test": "agbenchmark\\challenges\\interface\\read_file" }, - "TestRememberMultipleIdsWithNoise": { - "difficulty": "medium", + "TestBasicMemory": { + "difficulty": "basic", "dependencies": [ - "TestRememberMultipleIds" + "TestReadFile", + "TestWriteFile" ], - "test": "agbenchmark/challenges/memory/m3/remember_multiple_ids_with_noise_test.py" + "test": "agbenchmark\\challenges\\memory\\m1" }, - "TestRememberMultiplePhrasesWithNoise": { - "difficulty": "medium", + "TestBasicRetrieval": { + "difficulty": "basic", "dependencies": [ - "TestRememberMultipleIdsWithNoise" + "TestWriteFile" ], - "test": "agbenchmark/challenges/memory/m4/remember_multiple_phrases_with_noise_test.py" + "test": "agbenchmark\\challenges\\retrieval\\r1" }, - "TestRetrieval": { + "TestRememberMultipleIds": { "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/retrieval/r1/r1_test.py" + "dependencies": [ + "TestBasicMemory" + ], + "test": "agbenchmark\\challenges\\memory\\m2" }, "TestRetrieval2": { "difficulty": "basic", "dependencies": [ - "TestRetrieval" + "TestBasicRetrieval" + ], + "test": "agbenchmark\\challenges\\retrieval\\r2" + }, + "TestRememberMultipleIdsWithNoise": { + "difficulty": "medium", + "dependencies": [ + "TestRememberMultipleIds" ], - "test": "agbenchmark/challenges/retrieval/r2/r2_test.py" + "test": "agbenchmark\\challenges\\memory\\m3" }, "TestRetrieval3": { "difficulty": "basic", "dependencies": [ "TestRetrieval2" ], - "test": "agbenchmark/challenges/retrieval/r3/r3_test.py" + "test": "agbenchmark\\challenges\\retrieval\\r3" }, - "TestWriteFile": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/interface/write_file/write_file_test.py" - }, - "TestReadFile": { - "difficulty": "basic", + "TestRememberMultiplePhrasesWithNoise": { + "difficulty": "medium", "dependencies": [ - "TestWriteFile" + "TestRememberMultipleIdsWithNoise" ], - "test": "agbenchmark/challenges/interface/read_file/read_file_test.py" + "test": "agbenchmark\\challenges\\memory\\m4" } } \ No newline at end of file -- cgit v1.2.3 From 0fa5286ad0e06fc5089b7002a930f752227c2061 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Sun, 9 Jul 2023 18:06:26 -0700 Subject: Combine all agents into one ci.yml (#79) Signed-off-by: Merwane Hamadi --- .github/workflows/autogpt.yml | 64 -------------------------- .github/workflows/ci.yml | 76 ++++++++++++++++++++++++++++++- .github/workflows/gpt-engineer.yml | 88 ------------------------------------ .github/workflows/mini-agi.yml | 66 --------------------------- .github/workflows/smol-developer.yml | 64 -------------------------- .github/workflows/superagi.yml | 62 ------------------------- 6 files changed, 74 insertions(+), 346 deletions(-) delete mode 100644 .github/workflows/autogpt.yml delete mode 100644 .github/workflows/gpt-engineer.yml delete mode 100644 .github/workflows/mini-agi.yml delete mode 100644 .github/workflows/smol-developer.yml delete mode 100644 
.github/workflows/superagi.yml diff --git a/.github/workflows/autogpt.yml b/.github/workflows/autogpt.yml deleted file mode 100644 index 2d7e2dfbd..000000000 --- a/.github/workflows/autogpt.yml +++ /dev/null @@ -1,64 +0,0 @@ -name: Auto-GPT Regression Test - -on: - workflow_dispatch: - branches: [master] - push: - branches: [stable, master, ci-test*] - -jobs: - regression-tests: - permissions: - pull-requests: write - contents: write - runs-on: ubuntu-latest - timeout-minutes: 30 - strategy: - matrix: - python-version: ['3.10'] - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 0 - ref: ${{ github.event.pull_request.head.ref }} - repository: ${{ github.event.pull_request.head.repo.full_name }} - submodules: true - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - id: get_date - name: Get date - run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT - - - name: Install Poetry - run: | - curl -sSL https://install.python-poetry.org | python - - - - name: Set up Poetry cache - uses: actions/cache@v2 - with: - path: | - ~/.cache/pypoetry - .venv - key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} - - - name: Set up venv and install Python dependencies - run: | - poetry install --only main - poetry build - - - name: Run regression tests - run: | - python -m venv venv - source venv/bin/activate - cd agent/Auto-GPT - pip install -r requirements.txt - pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl - agbenchmark start --maintain - env: - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6a0f4503a..d989389db 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,6 +1,10 @@ -name: Python CI +name: CI on: + workflow_dispatch: + branches: [ master ] + schedule: + - cron: '0 8 * * *' push: branches: [ master, ci-test* ] pull_request: @@ -20,6 +24,7 @@ jobs: fetch-depth: 0 ref: ${{ github.event.pull_request.head.ref }} repository: ${{ github.event.pull_request.head.repo.full_name }} + submodules: true - name: Set up Python ${{ env.min-python-version }} uses: actions/setup-python@v2 @@ -68,10 +73,18 @@ jobs: if: success() || failure() tests: - + name: ${{ matrix.agent-name }} runs-on: ubuntu-latest env: min-python-version: "3.10" + strategy: + fail-fast: false + matrix: + agent-name: + - "gpt-engineer" + - "Auto-GPT" + - "mini-agi" + - "smol-developer" steps: - name: Checkout repository @@ -80,6 +93,7 @@ jobs: fetch-depth: 0 ref: ${{ github.event.pull_request.head.ref }} repository: ${{ github.event.pull_request.head.repo.full_name }} + submodules: true - name: Set up Python ${{ env.min-python-version }} uses: actions/setup-python@v2 @@ -107,3 +121,61 @@ jobs: poetry install poetry run agbenchmark start --mock poetry run agbenchmark start --mock --maintain + poetry build + + - name: Run regression tests + run: | + cd agent/$AGENT_NAME + if [ "$AGENT_NAME" == "gpt-engineer" ]; then + make install + source venv/bin/activate + elif [ "$AGENT_NAME" == "Auto-GPT" ]; then + python -m venv venv + source venv/bin/activate + pip install -r requirements.txt + elif [ "$AGENT_NAME" == "mini-agi" ]; then + python -m venv venv + source venv/bin/activate + pip install -r requirements.txt + cp .env_example .env + elif [ "$AGENT_NAME" == "smol-developer" ]; then + python -m venv venv + source 
venv/bin/activate + pip install -r requirements.txt + elif [ "$AGENT_NAME" == "SuperAGI" ]; then + cp config_template.yaml config.yaml + sed -i 's/OPENAI_API_KEY:.*/OPENAI_API_KEY: "'"${{ secrets.OPENAI_API_KEY }}"'"/' config.yaml + docker-compose up -d --build + else + echo "Unknown agent name: $AGENT_NAME" + exit 1 + fi + + pip install ../../dist/*.whl + + if [ "${GITHUB_EVENT_NAME}" == "schedule" ] || [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ]; then + agbenchmark start --maintain + else + exit 0 + agbenchmark start --maintain --mock + agbenchmark start --improve --mock + agbenchmark start --mock + agbenchmark start --mock --category=retrieval + agbenchmark start --mock --category=regression + agbenchmark start --mock --category=interface + agbenchmark start --mock --category=code + agbenchmark start --mock --category=memory + agbenchmark start --mock --category=memory --category=code + fi + env: + GITHUB_EVENT_NAME: ${{ github.event_name }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + AGENT_NAME: ${{ matrix.agent-name }} + PROMPT_USER: false # For mini-agi. TODO: Remove this once mini-agi follows the standards. + + - name: Upload logs as artifact + if: always() + uses: actions/upload-artifact@v3 + with: + name: gpt-engineer-projects + path: agent/gpt-engineer/projects diff --git a/.github/workflows/gpt-engineer.yml b/.github/workflows/gpt-engineer.yml deleted file mode 100644 index 833026e8e..000000000 --- a/.github/workflows/gpt-engineer.yml +++ /dev/null @@ -1,88 +0,0 @@ -name: gpt-engineer Regression Test - -on: - workflow_dispatch: - branches: [master] - push: - branches: [stable, master, ci-test*] - pull_request: - branches: [stable, master, ci-test*] - -jobs: - regression-tests: - permissions: - pull-requests: write - contents: write - runs-on: ubuntu-latest - timeout-minutes: 30 - strategy: - matrix: - python-version: ['3.10'] - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 0 - ref: ${{ github.event.pull_request.head.ref }} - repository: ${{ github.event.pull_request.head.repo.full_name }} - submodules: true - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - id: get_date - name: Get date - run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT - - - name: Install Poetry - run: | - curl -sSL https://install.python-poetry.org | python - - - - name: Set up Poetry cache - uses: actions/cache@v2 - with: - path: | - ~/.cache/pypoetry - .venv - key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} - - - name: Set up venv and install Python dependencies - run: | - poetry install --only main - poetry build - - - name: Run regression tests (push) - if: ${{ github.event_name != 'pull_request' }} - run: | - cd agent/gpt-engineer - make install - source venv/bin/activate - pip install ../../dist/*.whl - - if [ "${GITHUB_EVENT_NAME}" != "pull_request" ]; then - agbenchmark start --maintain - else - agbenchmark start --maintain --mock - agbenchmark start --improve --mock - agbenchmark start --mock - agbenchmark start --mock --category=retrieval - agbenchmark start --mock --category=regression - agbenchmark start --mock --category=interface - agbenchmark start --mock --category=code - agbenchmark start --mock --category=memory - agbenchmark start --mock --category=memory --category=code - fi - - env: - GITHUB_EVENT_NAME: ${{ github.event_name }} - OPENAI_API_KEY: 
${{ secrets.OPENAI_API_KEY }} - - - name: Upload logs as artifact - if: always() - uses: actions/upload-artifact@v3 - with: - name: gpt-engineer-projects - path: agent/gpt-engineer/projects diff --git a/.github/workflows/mini-agi.yml b/.github/workflows/mini-agi.yml deleted file mode 100644 index 53c479df4..000000000 --- a/.github/workflows/mini-agi.yml +++ /dev/null @@ -1,66 +0,0 @@ -name: mini-agi Regression Test - -on: - workflow_dispatch: - branches: [master] - push: - branches: [stable, master, ci-test*] - -jobs: - regression-tests: - permissions: - pull-requests: write - contents: write - runs-on: ubuntu-latest - timeout-minutes: 30 - strategy: - matrix: - python-version: ['3.10'] - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 0 - ref: ${{ github.event.pull_request.head.ref }} - repository: ${{ github.event.pull_request.head.repo.full_name }} - submodules: true - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - id: get_date - name: Get date - run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT - - - name: Install Poetry - run: | - curl -sSL https://install.python-poetry.org | python - - - - name: Set up Poetry cache - uses: actions/cache@v2 - with: - path: | - ~/.cache/pypoetry - .venv - key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} - - - name: Set up venv and install Python dependencies - run: | - poetry install --only main - poetry build - - - name: Run regression tests - run: | - cd agent/mini-agi - python -m venv venv - source venv/bin/activate - pip install -r requirements.txt - cp .env_example .env - pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl - agbenchmark start --maintain - env: - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - PROMPT_USER: false diff --git a/.github/workflows/smol-developer.yml b/.github/workflows/smol-developer.yml deleted file mode 100644 index 6926df54b..000000000 --- a/.github/workflows/smol-developer.yml +++ /dev/null @@ -1,64 +0,0 @@ -name: smol developer Regression Test - -on: - workflow_dispatch: - branches: [master] - push: - branches: [stable, master, ci-test*] - -jobs: - regression-tests: - permissions: - pull-requests: write - contents: write - runs-on: ubuntu-latest - timeout-minutes: 30 - strategy: - matrix: - python-version: ['3.10'] - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 0 - ref: ${{ github.event.pull_request.head.ref }} - repository: ${{ github.event.pull_request.head.repo.full_name }} - submodules: true - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - id: get_date - name: Get date - run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT - - - name: Install Poetry - run: | - curl -sSL https://install.python-poetry.org | python - - - - name: Set up Poetry cache - uses: actions/cache@v2 - with: - path: | - ~/.cache/pypoetry - .venv - key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} - - - name: Set up venv and install Python dependencies - run: | - poetry install --only main - poetry build - - - name: Run regression tests - run: | - cd agent/smol-developer - python -m venv venv - source venv/bin/activate - pip install -r requirements.txt - pip install 
../../dist/agbenchmark-0.1.0-py3-none-any.whl - agbenchmark start --maintain - env: - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} diff --git a/.github/workflows/superagi.yml b/.github/workflows/superagi.yml deleted file mode 100644 index 5ab52d33f..000000000 --- a/.github/workflows/superagi.yml +++ /dev/null @@ -1,62 +0,0 @@ -name: SuperAgi Regression Test - -on: - workflow_dispatch: - branches: [master] - push: - branches: [stable, master, ci-test*] - -jobs: - regression-tests: - permissions: - pull-requests: write - contents: write - runs-on: ubuntu-latest - timeout-minutes: 30 - strategy: - matrix: - python-version: ['3.10'] - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 0 - ref: ${{ github.event.pull_request.head.ref }} - repository: ${{ github.event.pull_request.head.repo.full_name }} - submodules: true - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - - id: get_date - name: Get date - run: echo "date=$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT - - - name: Install Poetry - run: | - curl -sSL https://install.python-poetry.org | python - - - - name: Set up Poetry cache - uses: actions/cache@v2 - with: - path: | - ~/.cache/pypoetry - .venv - key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} - - - name: Set up venv and install Python dependencies - run: | - poetry install --only main - poetry build - - - name: Run regression tests - run: | - cd agent/SuperAGI - cp config_template.yaml config.yaml - sed -i 's/OPENAI_API_KEY:.*/OPENAI_API_KEY: "'"${{ secrets.OPENAI_API_KEY }}"'"/' config.yaml - docker-compose up -d --build - pip install ../../dist/agbenchmark-0.1.0-py3-none-any.whl - agbenchmark start --maintain -- cgit v1.2.3 From b8830f86256ce54c990fc4bd4a0fe2ac7389cdbd Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 9 Jul 2023 21:33:08 -0400 Subject: Adding search interface challenge and cleaning repo (#80) --- agbenchmark/challenges/adaptability/a1_test.py | 0 agbenchmark/challenges/interface/browse_test.py | 0 .../interface/search/artifacts_out/random_file.txt | 2 ++ agbenchmark/challenges/interface/search/data.json | 18 ++++++++++++++++++ agbenchmark/challenges/retrieval/r1/data.json | 2 +- agbenchmark/challenges/web_navigation/wn1_test.py | 0 agbenchmark/challenges/writing/w1_test.py | 0 regression_tests.json | 5 +++++ 8 files changed, 26 insertions(+), 1 deletion(-) delete mode 100644 agbenchmark/challenges/adaptability/a1_test.py delete mode 100644 agbenchmark/challenges/interface/browse_test.py create mode 100644 agbenchmark/challenges/interface/search/artifacts_out/random_file.txt create mode 100644 agbenchmark/challenges/interface/search/data.json delete mode 100644 agbenchmark/challenges/web_navigation/wn1_test.py delete mode 100644 agbenchmark/challenges/writing/w1_test.py diff --git a/agbenchmark/challenges/adaptability/a1_test.py b/agbenchmark/challenges/adaptability/a1_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/challenges/interface/browse_test.py b/agbenchmark/challenges/interface/browse_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/challenges/interface/search/artifacts_out/random_file.txt b/agbenchmark/challenges/interface/search/artifacts_out/random_file.txt new file mode 100644 index 000000000..035667591 --- /dev/null +++ 
b/agbenchmark/challenges/interface/search/artifacts_out/random_file.txt @@ -0,0 +1,2 @@ +This is a Heading +This is a paragraph. \ No newline at end of file diff --git a/agbenchmark/challenges/interface/search/data.json b/agbenchmark/challenges/interface/search/data.json new file mode 100644 index 000000000..17ee1ac1a --- /dev/null +++ b/agbenchmark/challenges/interface/search/data.json @@ -0,0 +1,18 @@ +{ + "name": "TestSearch", + "category": ["interface"], + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "dependencies": [], + "ground": { + "answer": "This is a Heading\nThis is a paragraph.", + "should_contain": ["Heading", "paragraph"], + "should_not_contain": ["The", "the"], + "files": [".txt"], + "type": "file" + }, + "info": { + "difficulty": "basic", + "description": "Tests if an llm can search", + "side_effects": [""] + } +} diff --git a/agbenchmark/challenges/retrieval/r1/data.json b/agbenchmark/challenges/retrieval/r1/data.json index 7812c21da..4f3833dfc 100644 --- a/agbenchmark/challenges/retrieval/r1/data.json +++ b/agbenchmark/challenges/retrieval/r1/data.json @@ -2,7 +2,7 @@ "name": "TestBasicRetrieval", "category": ["retrieval"], "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", - "dependencies": ["TestWriteFile"], + "dependencies": ["TestWriteFile", "TestSearch"], "ground": { "answer": "£25.89", "should_contain": ["25.89"], diff --git a/agbenchmark/challenges/web_navigation/wn1_test.py b/agbenchmark/challenges/web_navigation/wn1_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/challenges/writing/w1_test.py b/agbenchmark/challenges/writing/w1_test.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/regression_tests.json b/regression_tests.json index 613207917..10a6e11bf 100644 --- a/regression_tests.json +++ b/regression_tests.json @@ -60,5 +60,10 @@ "TestRememberMultipleIdsWithNoise" ], "test": "agbenchmark\\challenges\\memory\\m4" + }, + "TestSearch": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark\\challenges\\interface\\search" } } \ No newline at end of file -- cgit v1.2.3 From 9adcad8b8aefd20ae62d0826f5c17394b352d09c Mon Sep 17 00:00:00 2001 From: James Collins Date: Sun, 9 Jul 2023 19:32:04 -0700 Subject: Fix regression: restore api_base and organization configurability (#4933) --- autogpt/config/config.py | 13 ++++++++++++- autogpt/llm/utils/__init__.py | 18 ++++-------------- autogpt/memory/vector/utils.py | 7 ++----- tests/unit/test_config.py | 26 ++++++++++++++++++++------ 4 files changed, 38 insertions(+), 26 deletions(-) diff --git a/autogpt/config/config.py b/autogpt/config/config.py index 05590eb6a..b1ff0a0ab 100644 --- a/autogpt/config/config.py +++ b/autogpt/config/config.py @@ -86,7 +86,18 @@ class Config(SystemSettings): plugins: list[str] authorise_key: str - def get_azure_kwargs(self, model: str) -> dict[str, str]: + def get_openai_credentials(self, model: str) -> dict[str, str]: + credentials = { + "api_key": self.openai_api_key, + "api_base": self.openai_api_base, + "organization": self.openai_organization, + } + if self.use_azure: + azure_credentials = self.get_azure_credentials(model) + credentials.update(azure_credentials) + return credentials + + def get_azure_credentials(self, model: str) -> dict[str, str]: """Get the kwargs for the Azure API.""" # Fix --gpt3only and --gpt4only in combination with Azure diff --git 
a/autogpt/llm/utils/__init__.py b/autogpt/llm/utils/__init__.py index 3c2835b7c..e0ff1473f 100644 --- a/autogpt/llm/utils/__init__.py +++ b/autogpt/llm/utils/__init__.py @@ -71,17 +71,14 @@ def create_text_completion( if temperature is None: temperature = config.temperature - if config.use_azure: - kwargs = config.get_azure_kwargs(model) - else: - kwargs = {"model": model} + kwargs = {"model": model} + kwargs.update(config.get_openai_credentials(model)) response = iopenai.create_text_completion( prompt=prompt, **kwargs, temperature=temperature, max_tokens=max_output_tokens, - api_key=config.openai_api_key, ) logger.debug(f"Response: {response}") @@ -137,9 +134,7 @@ def create_chat_completion( if message is not None: return message - chat_completion_kwargs["api_key"] = config.openai_api_key - if config.use_azure: - chat_completion_kwargs.update(config.get_azure_kwargs(model)) + chat_completion_kwargs.update(config.get_openai_credentials(model)) if functions: chat_completion_kwargs["functions"] = [ @@ -179,12 +174,7 @@ def check_model( config: Config, ) -> str: """Check if model is available for use. If not, return gpt-3.5-turbo.""" - openai_credentials = { - "api_key": config.openai_api_key, - } - if config.use_azure: - openai_credentials.update(config.get_azure_kwargs(model_name)) - + openai_credentials = config.get_openai_credentials(model_name) api_manager = ApiManager() models = api_manager.get_models(**openai_credentials) diff --git a/autogpt/memory/vector/utils.py b/autogpt/memory/vector/utils.py index 74438f28c..eb6912566 100644 --- a/autogpt/memory/vector/utils.py +++ b/autogpt/memory/vector/utils.py @@ -41,10 +41,8 @@ def get_embedding( input = [text.replace("\n", " ") for text in input] model = config.embedding_model - if config.use_azure: - kwargs = config.get_azure_kwargs(model) - else: - kwargs = {"model": model} + kwargs = {"model": model} + kwargs.update(config.get_openai_credentials(model)) logger.debug( f"Getting embedding{f's for {len(input)} inputs' if multiple else ''}" @@ -57,7 +55,6 @@ def get_embedding( embeddings = iopenai.create_embedding( input, **kwargs, - api_key=config.openai_api_key, ).data if not multiple: diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py index b441aa948..7abbfcd52 100644 --- a/tests/unit/test_config.py +++ b/tests/unit/test_config.py @@ -174,18 +174,32 @@ azure_model_map: fast_llm = config.fast_llm smart_llm = config.smart_llm - assert config.get_azure_kwargs(config.fast_llm)["deployment_id"] == "FAST-LLM_ID" - assert config.get_azure_kwargs(config.smart_llm)["deployment_id"] == "SMART-LLM_ID" + assert ( + config.get_azure_credentials(config.fast_llm)["deployment_id"] == "FAST-LLM_ID" + ) + assert ( + config.get_azure_credentials(config.smart_llm)["deployment_id"] + == "SMART-LLM_ID" + ) # Emulate --gpt4only config.fast_llm = smart_llm - assert config.get_azure_kwargs(config.fast_llm)["deployment_id"] == "SMART-LLM_ID" - assert config.get_azure_kwargs(config.smart_llm)["deployment_id"] == "SMART-LLM_ID" + assert ( + config.get_azure_credentials(config.fast_llm)["deployment_id"] == "SMART-LLM_ID" + ) + assert ( + config.get_azure_credentials(config.smart_llm)["deployment_id"] + == "SMART-LLM_ID" + ) # Emulate --gpt3only config.fast_llm = config.smart_llm = fast_llm - assert config.get_azure_kwargs(config.fast_llm)["deployment_id"] == "FAST-LLM_ID" - assert config.get_azure_kwargs(config.smart_llm)["deployment_id"] == "FAST-LLM_ID" + assert ( + config.get_azure_credentials(config.fast_llm)["deployment_id"] == "FAST-LLM_ID" + ) 
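The assertions above and below exercise only the Azure path, but the regression being fixed is that api_base and organization were silently dropped for plain OpenAI configurations. A companion check in the same style, using the get_openai_credentials accessor introduced above (the fixture values are hypothetical):

    def test_get_openai_credentials_non_azure(config) -> None:
        # With use_azure disabled, the generic credentials should pass
        # api_key, api_base and organization straight through.
        config.use_azure = False
        credentials = config.get_openai_credentials(config.fast_llm)
        assert credentials["api_key"] == config.openai_api_key
        assert credentials["api_base"] == config.openai_api_base
        assert credentials["organization"] == config.openai_organization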
+ assert ( + config.get_azure_credentials(config.smart_llm)["deployment_id"] == "FAST-LLM_ID" + ) del os.environ["USE_AZURE"] del os.environ["AZURE_CONFIG_FILE"] -- cgit v1.2.3 From 4d514694738eb1a9a581136e85cb6aeb0ba27d63 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Mon, 10 Jul 2023 18:13:59 +0200 Subject: Fix CI cassette checkout --- .github/workflows/ci.yml | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3e21d1d70..dde98cf91 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -108,22 +108,27 @@ jobs: if: ${{ startsWith(github.event_name, 'pull_request') }} run: | cassette_branch="${{ github.event.pull_request.user.login }}-${{ github.event.pull_request.head.ref }}" + cassette_base_branch="${{ github.event.pull_request.base.ref }}" cd tests/Auto-GPT-test-cassettes + if ! git ls-remote --exit-code --heads origin $cassette_base_branch ; then + cassette_base_branch="master" + fi + if git ls-remote --exit-code --heads origin $cassette_branch ; then git fetch origin $cassette_branch - git fetch origin ${{ github.event.pull_request.base.ref }} + git fetch origin $cassette_base_branch git checkout $cassette_branch # Pick non-conflicting cassette updates from the base branch - git merge --no-commit --strategy-option=ours origin/${{ github.event.pull_request.base.ref }} + git merge --no-commit --strategy-option=ours origin/$cassette_base_branch echo "Using cassettes from mirror branch '$cassette_branch'," \ - "synced to upstream branch '${{ github.event.pull_request.base.ref }}'." + "synced to upstream branch '$cassette_base_branch'." else git checkout -b $cassette_branch echo "Branch '$cassette_branch' does not exist in cassette submodule." \ - "Using cassettes from '${{ github.event.pull_request.base.ref }}'." + "Using cassettes from '$cassette_base_branch'." fi - name: Set up Python ${{ matrix.python-version }} -- cgit v1.2.3 From 30ba51593f277711148da30f465417adb848472c Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Mon, 10 Jul 2023 09:19:12 -0700 Subject: Add Helicone (#81) --- .github/workflows/ci.yml | 1 + agbenchmark/challenge.py | 3 ++- agbenchmark/challenges/test_all.py | 3 ++- agent/Auto-GPT | 2 +- 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d989389db..cac1dedb1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -172,6 +172,7 @@ jobs: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} AGENT_NAME: ${{ matrix.agent-name }} PROMPT_USER: false # For mini-agi. TODO: Remove this once mini-agi follows the standards. 
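The HELICONE_API_KEY secret added below is consumed by the bumped Auto-GPT submodule rather than by the benchmark harness itself, so the wiring is not visible in this patch. Helicone's documented pattern is to route OpenAI traffic through its proxy; a minimal sketch, noting that the base URL and header name come from Helicone's public docs rather than from this repository:

    import os

    import openai

    # Point the pre-1.0 OpenAI SDK at Helicone's proxy so each request is
    # logged; the Helicone key travels in a Helicone-Auth header while the
    # OpenAI key is used as normal.
    openai.api_base = "https://oai.hconeai.com/v1"
    openai.api_key = os.environ["OPENAI_API_KEY"]

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "ping"}],
        headers={"Helicone-Auth": "Bearer " + os.environ["HELICONE_API_KEY"]},
    )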
+ HELICONE_API_KEY: ${{ secrets.HELICONE_API_KEY }} - name: Upload logs as artifact if: always() diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py index cf7ce104c..aeebd7ad8 100644 --- a/agbenchmark/challenge.py +++ b/agbenchmark/challenge.py @@ -7,6 +7,7 @@ from typing import Any, Dict, List from dotenv import load_dotenv from agbenchmark.challenges.define_task_types import ChallengeData, Ground +from agbenchmark.start_benchmark import CURRENT_DIRECTORY load_dotenv() @@ -23,7 +24,7 @@ class Challenge(ABC): @property def data(self) -> ChallengeData: - file_path = f"{self.CHALLENGE_LOCATION}/data.json" + file_path = f"{CURRENT_DIRECTORY}/../{self.CHALLENGE_LOCATION}/data.json" if file_path not in Challenge._data_cache: Challenge._data_cache[file_path] = ChallengeData.deserialize(file_path) return Challenge._data_cache[file_path] diff --git a/agbenchmark/challenges/test_all.py b/agbenchmark/challenges/test_all.py index 4f9e5b7f8..e7fe99e73 100644 --- a/agbenchmark/challenges/test_all.py +++ b/agbenchmark/challenges/test_all.py @@ -10,13 +10,14 @@ import pytest from dotenv import load_dotenv from agbenchmark.challenge import Challenge +from agbenchmark.start_benchmark import CURRENT_DIRECTORY load_dotenv() IMPROVE = os.getenv("IMPROVE", "False") -json_files = glob.glob("agbenchmark/challenges/**/data.json", recursive=True) +json_files = glob.glob(f"{CURRENT_DIRECTORY}/challenges/**/data.json", recursive=True) def get_test_path(json_file: str) -> str: diff --git a/agent/Auto-GPT b/agent/Auto-GPT index cec424ad2..f360d503b 160000 --- a/agent/Auto-GPT +++ b/agent/Auto-GPT @@ -1 +1 @@ -Subproject commit cec424ad2504020a830c3af9f74536a420545931 +Subproject commit f360d503b113119f6b3ce0acff1dbb4dfae2223a -- cgit v1.2.3 From 437e066a66c4f3d6aeba26f79fe1c3d8e4ea5743 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Mon, 10 Jul 2023 17:46:03 -0700 Subject: Add "Simple web server" challenge (#74) Co-authored-by: Silen Naihin --- agbenchmark/RegressionManager.py | 15 ++++- .../challenges/code/d3/custom_python/api_tests.py | 27 ++++++++ agbenchmark/challenges/code/d3/data.json | 18 ++++++ agbenchmark/challenges/test_all.py | 25 +++++++- poetry.lock | 31 ++++++++- pyproject.toml | 1 + regression_tests.json | 73 ++++++++++++++-------- 7 files changed, 160 insertions(+), 30 deletions(-) create mode 100644 agbenchmark/challenges/code/d3/custom_python/api_tests.py create mode 100644 agbenchmark/challenges/code/d3/data.json diff --git a/agbenchmark/RegressionManager.py b/agbenchmark/RegressionManager.py index e289a4787..ac9efc696 100644 --- a/agbenchmark/RegressionManager.py +++ b/agbenchmark/RegressionManager.py @@ -1,4 +1,5 @@ import json +from typing import Union class RegressionManager: @@ -15,7 +16,9 @@ class RegressionManager: f.read().strip() ) # read the content and remove any leading/trailing whitespace if file_content: # if file is not empty, load the json - self.tests = json.loads(file_content) + data = json.loads(file_content) + self.tests = {k: data[k] for k in sorted(data)} + data = self.replace_backslash(data) else: # if file is empty, assign an empty dictionary self.tests = {} except FileNotFoundError: @@ -36,3 +39,13 @@ class RegressionManager: if test_name in self.tests: del self.tests[test_name] self.save() + + def replace_backslash(self, value: str) -> Union[str, list[str], dict]: + if isinstance(value, str): + return value.replace("\\\\", "/") # escape \ with \\ + elif isinstance(value, list): + return [self.replace_backslash(i) for i in value] + elif isinstance(value, 
dict): + return {k: self.replace_backslash(v) for k, v in value.items()} + else: + return value diff --git a/agbenchmark/challenges/code/d3/custom_python/api_tests.py b/agbenchmark/challenges/code/d3/custom_python/api_tests.py new file mode 100644 index 000000000..1d6255ebd --- /dev/null +++ b/agbenchmark/challenges/code/d3/custom_python/api_tests.py @@ -0,0 +1,27 @@ +import os +from typing import Any, Dict +from unittest.mock import Mock, patch + +import requests + + +def make_assertion() -> None: + if os.environ.get("MOCK_TEST", "False").lower() == "true": + mock_response = Mock(requests.Response) + mock_response.status_code = 200 + mock_response.json.return_value = {"status": "OK"} + + with patch("requests.get", return_value=mock_response): + make_request_and_assert() + else: + make_request_and_assert() + + +def make_request_and_assert() -> Dict[str, Any]: + response = requests.get("http://localhost:8079/health") + if response.status_code != 200: + raise AssertionError( + f"Expected status code 200, but got {response.status_code}" + ) + + return response.json() diff --git a/agbenchmark/challenges/code/d3/data.json b/agbenchmark/challenges/code/d3/data.json new file mode 100644 index 000000000..07d607f5f --- /dev/null +++ b/agbenchmark/challenges/code/d3/data.json @@ -0,0 +1,18 @@ +{ + "name": "TestCreateSimpleWebServer", + "category": ["code"], + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "dependencies": ["TestDebugSimpleTypoWithGuidance"], + "ground": { + "answer": "GET localhost:8079/health responds with a 200 OK", + "should_contain": [], + "should_not_contain": [], + "files": [], + "type": "custom_python" + }, + "info": { + "difficulty": "medium", + "description": "Tests ability for the agent to build a simple web server locally", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/test_all.py b/agbenchmark/challenges/test_all.py index e7fe99e73..7dee0b2ab 100644 --- a/agbenchmark/challenges/test_all.py +++ b/agbenchmark/challenges/test_all.py @@ -2,6 +2,8 @@ import glob import importlib import json import os +import pkgutil +import sys import types from pathlib import Path from typing import Any, Dict @@ -47,6 +49,19 @@ def generate_tests() -> None: class_name = data.get("name", "") challenge_location = get_test_path(json_file) + if data["ground"]["type"] == "custom_python": + custom_python_location = ( + f"{CURRENT_DIRECTORY}/../{challenge_location}/custom_python" + ) + sys.path.append(str(custom_python_location)) + + for (module_loader, name, ispkg) in pkgutil.iter_modules( + [str(custom_python_location)] + ): + module = importlib.import_module(name) + + if hasattr(module, "make_assertion"): + make_assertion = getattr(module, "make_assertion") # Define test class dynamically challenge_class = types.new_class(class_name, (Challenge,)) @@ -58,7 +73,15 @@ def generate_tests() -> None: self.setup_challenge(config) scores = self.get_scores(config) - assert 1 in scores + + # Check if make_assertion is defined and use it + if "make_assertion" in locals(): + try: + make_assertion() + except AssertionError as error: + print(error) # Or handle this in another way + else: + assert 1 in scores # Parametrize the method here test_method = pytest.mark.parametrize( diff --git a/poetry.lock b/poetry.lock index 4eae340b6..5526da16b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry and should not be changed by hand. 
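The new "custom_python" ground type above works because test_all.py scans the challenge's folder at collection time and imports whatever it finds. A stripped-down sketch of that discovery pattern; the folder path is illustrative:

import importlib
import pkgutil
import sys

def load_assertions(custom_python_location: str) -> list:
    """Import each module in the folder and collect its make_assertion, if any."""
    sys.path.append(custom_python_location)
    assertions = []
    for _finder, name, _ispkg in pkgutil.iter_modules([custom_python_location]):
        module = importlib.import_module(name)
        if hasattr(module, "make_assertion"):
            assertions.append(getattr(module, "make_assertion"))
    return assertions

# Each collected callable raises AssertionError on failure:
for assertion in load_assertions("agbenchmark/challenges/code/d3/custom_python"):
    assertion()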
+# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. [[package]] name = "aiohttp" @@ -961,6 +961,33 @@ notebook = ["ipywidgets (>=6)"] slack = ["slack-sdk"] telegram = ["requests"] +[[package]] +name = "types-requests" +version = "2.31.0.1" +description = "Typing stubs for requests" +category = "main" +optional = false +python-versions = "*" +files = [ + {file = "types-requests-2.31.0.1.tar.gz", hash = "sha256:3de667cffa123ce698591de0ad7db034a5317457a596eb0b4944e5a9d9e8d1ac"}, + {file = "types_requests-2.31.0.1-py3-none-any.whl", hash = "sha256:afb06ef8f25ba83d59a1d424bd7a5a939082f94b94e90ab5e6116bd2559deaa3"}, +] + +[package.dependencies] +types-urllib3 = "*" + +[[package]] +name = "types-urllib3" +version = "1.26.25.13" +description = "Typing stubs for urllib3" +category = "main" +optional = false +python-versions = "*" +files = [ + {file = "types-urllib3-1.26.25.13.tar.gz", hash = "sha256:3300538c9dc11dad32eae4827ac313f5d986b8b21494801f1bf97a1ac6c03ae5"}, + {file = "types_urllib3-1.26.25.13-py3-none-any.whl", hash = "sha256:5dbd1d2bef14efee43f5318b5d36d805a489f6600252bb53626d4bfafd95e27c"}, +] + [[package]] name = "typing-extensions" version = "4.7.1" @@ -1082,4 +1109,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "44b5789494e73f3cb8bcb9d25daa62143e59352a246fd7724fdb3ad58c2560ae" +content-hash = "81b84bbe08d4a09fb6a4f99c7fb018e0c0fcd879fa368c388b0af20c7c9a3f31" diff --git a/pyproject.toml b/pyproject.toml index a8f4f8dee..1a96a51de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ pydantic = "^1.10.9" pytest-depends = "^1.0.1" python-dotenv = "^0.21.0" click = "^8.1.3" +types-requests = "^2.31.0.1" [tool.poetry.group.dev.dependencies] flake8 = "^3.9.2" diff --git a/regression_tests.json b/regression_tests.json index 10a6e11bf..0cf2d5f30 100644 --- a/regression_tests.json +++ b/regression_tests.json @@ -1,69 +1,90 @@ { - "TestWriteFile": { + "TestBasicMemory": { "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark\\challenges\\interface\\write_file" + "dependencies": [ + "TestReadFile", + "TestWriteFile" + ], + "test": "agbenchmark/challenges/memory/m1" }, - "TestReadFile": { + "TestBasicRetrieval": { "difficulty": "basic", "dependencies": [ - "TestWriteFile" + "TestWriteFile", + "TestSearch" ], - "test": "agbenchmark\\challenges\\interface\\read_file" + "test": "agbenchmark/challenges/retrieval/r1" }, - "TestBasicMemory": { + "TestCreateSimpleWebServer": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/code/d3" + }, + "TestDebugSimpleTypoWithGuidance": { "difficulty": "basic", "dependencies": [ "TestReadFile", "TestWriteFile" ], - "test": "agbenchmark\\challenges\\memory\\m1" + "test": "agbenchmark/challenges/code/d1" }, - "TestBasicRetrieval": { + "TestDebugSimpleTypoWithoutGuidance": { + "difficulty": "medium", + "dependencies": [ + "TestDebugSimpleTypoWithGuidance" + ], + "test": "agbenchmark/challenges/code/d2" + }, + "TestReadFile": { "difficulty": "basic", "dependencies": [ "TestWriteFile" ], - "test": "agbenchmark\\challenges\\retrieval\\r1" + "test": "agbenchmark/challenges/interface/read_file" }, "TestRememberMultipleIds": { "difficulty": "basic", "dependencies": [ "TestBasicMemory" ], - "test": "agbenchmark\\challenges\\memory\\m2" + "test": "agbenchmark/challenges/memory/m2" }, - "TestRetrieval2": { - "difficulty": "basic", + "TestRememberMultipleIdsWithNoise": { + "difficulty": "medium", "dependencies": [ - 
"TestBasicRetrieval" + "TestRememberMultipleIds" ], - "test": "agbenchmark\\challenges\\retrieval\\r2" + "test": "agbenchmark/challenges/memory/m3" }, - "TestRememberMultipleIdsWithNoise": { + "TestRememberMultiplePhrasesWithNoise": { "difficulty": "medium", "dependencies": [ - "TestRememberMultipleIds" + "TestRememberMultipleIdsWithNoise" ], - "test": "agbenchmark\\challenges\\memory\\m3" + "test": "agbenchmark/challenges/memory/m4" }, - "TestRetrieval3": { + "TestRetrieval2": { "difficulty": "basic", "dependencies": [ - "TestRetrieval2" + "TestBasicRetrieval" ], - "test": "agbenchmark\\challenges\\retrieval\\r3" + "test": "agbenchmark/challenges/retrieval/r2" }, - "TestRememberMultiplePhrasesWithNoise": { - "difficulty": "medium", + "TestRetrieval3": { + "difficulty": "basic", "dependencies": [ - "TestRememberMultipleIdsWithNoise" + "TestRetrieval2" ], - "test": "agbenchmark\\challenges\\memory\\m4" + "test": "agbenchmark/challenges/retrieval/r3" }, "TestSearch": { "difficulty": "basic", "dependencies": [], - "test": "agbenchmark\\challenges\\interface\\search" + "test": "agbenchmark/challenges/interface/search" + }, + "TestWriteFile": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/interface/write_file" } } \ No newline at end of file -- cgit v1.2.3 From 8df82909b2938424d387cdaa817821adcbee1dac Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Mon, 10 Jul 2023 22:25:19 -0400 Subject: Added --test, consolidate files, reports working (#83) --- agbenchmark/RegressionManager.py | 51 ---------- agbenchmark/ReportManager.py | 68 ++++++++++++++ agbenchmark/agent_interface.py | 14 ++- agbenchmark/challenges/define_task_types.py | 6 ++ agbenchmark/challenges/interface/search/data.json | 2 +- agbenchmark/challenges/test_all.py | 4 +- agbenchmark/config.json | 5 + agbenchmark/conftest.py | 19 +++- agbenchmark/regression_tests.json | 99 ++++++++++++++++++++ agbenchmark/reports/1.json | 109 ++++++++++++++++++++++ agbenchmark/start_benchmark.py | 59 ++++++++---- agbenchmark/utils.py | 16 ++++ agent/Auto-GPT | 2 +- agent/SuperAGI | 2 +- agent/config_example.json | 3 +- agent/gpt-engineer | 2 +- agent/mini-agi | 2 +- agent/smol-developer | 2 +- config.json | 6 -- mypy.ini | 2 +- regression_tests.json | 90 ------------------ 21 files changed, 375 insertions(+), 188 deletions(-) delete mode 100644 agbenchmark/RegressionManager.py create mode 100644 agbenchmark/ReportManager.py create mode 100644 agbenchmark/config.json create mode 100644 agbenchmark/regression_tests.json create mode 100644 agbenchmark/reports/1.json delete mode 100644 config.json delete mode 100644 regression_tests.json diff --git a/agbenchmark/RegressionManager.py b/agbenchmark/RegressionManager.py deleted file mode 100644 index ac9efc696..000000000 --- a/agbenchmark/RegressionManager.py +++ /dev/null @@ -1,51 +0,0 @@ -import json -from typing import Union - - -class RegressionManager: - """Abstracts interaction with the regression tests file""" - - def __init__(self, filename: str): - self.filename = filename - self.load() - - def load(self) -> None: - try: - with open(self.filename, "r") as f: - file_content = ( - f.read().strip() - ) # read the content and remove any leading/trailing whitespace - if file_content: # if file is not empty, load the json - data = json.loads(file_content) - self.tests = {k: data[k] for k in sorted(data)} - data = self.replace_backslash(data) - else: # if file is empty, assign an empty dictionary - self.tests = {} - except FileNotFoundError: - self.tests = {} - except 
json.decoder.JSONDecodeError: # If JSON is invalid - self.tests = {} - self.save() - - def save(self) -> None: - with open(self.filename, "w") as f: - json.dump(self.tests, f, indent=4) - - def add_test(self, test_name: str, test_details: dict) -> None: - self.tests[test_name] = test_details - self.save() - - def remove_test(self, test_name: str) -> None: - if test_name in self.tests: - del self.tests[test_name] - self.save() - - def replace_backslash(self, value: str) -> Union[str, list[str], dict]: - if isinstance(value, str): - return value.replace("\\\\", "/") # escape \ with \\ - elif isinstance(value, list): - return [self.replace_backslash(i) for i in value] - elif isinstance(value, dict): - return {k: self.replace_backslash(v) for k, v in value.items()} - else: - return value diff --git a/agbenchmark/ReportManager.py b/agbenchmark/ReportManager.py new file mode 100644 index 000000000..e6d8f62f6 --- /dev/null +++ b/agbenchmark/ReportManager.py @@ -0,0 +1,68 @@ +import json +import os +import sys +import time +from datetime import datetime +from typing import Any, Dict, Union + + +class ReportManager: + """Abstracts interaction with the regression tests file""" + + def __init__(self, filename: str): + self.filename = filename + self.start_time = time.time() + self.load() + + def load(self) -> None: + try: + with open(self.filename, "r") as f: + file_content = ( + f.read().strip() + ) # read the content and remove any leading/trailing whitespace + if file_content: # if file is not empty, load the json + data = json.loads(file_content) + self.tests = {k: data[k] for k in sorted(data)} + data = self.replace_backslash(data) + else: # if file is empty, assign an empty dictionary + self.tests = {} + except FileNotFoundError: + self.tests = {} + except json.decoder.JSONDecodeError: # If JSON is invalid + self.tests = {} + self.save() + + def save(self) -> None: + with open(self.filename, "w") as f: + json.dump(self.tests, f, indent=4) + + def add_test(self, test_name: str, test_details: dict) -> None: + self.tests[test_name] = test_details + self.save() + + def remove_test(self, test_name: str) -> None: + if test_name in self.tests: + del self.tests[test_name] + self.save() + + def end_info_report(self, config: Dict[str, Any]) -> None: + command = " ".join(sys.argv) + self.tests = { + "command": command.split(os.sep)[-1], + "completion_time": datetime.now().strftime("%Y-%m-%d-%H:%M"), + "time_elapsed": str(round(time.time() - self.start_time, 2)) + " seconds", + "tests": self.tests, + "config": config, + } + + self.save() + + def replace_backslash(self, value: str) -> Union[str, list[str], dict]: + if isinstance(value, str): + return value.replace("\\\\", "/") # escape \ with \\ + elif isinstance(value, list): + return [self.replace_backslash(i) for i in value] + elif isinstance(value, dict): + return {k: self.replace_backslash(v) for k, v in value.items()} + else: + return value diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index 1d43577c7..d058ad4c2 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -3,6 +3,7 @@ import shutil import subprocess import sys import time +from pathlib import Path from typing import Any, Dict from dotenv import load_dotenv @@ -21,6 +22,7 @@ def run_agent( """Calling to get a response""" if MOCK_FLAG: + print("ITS A MOCK TEST", challenge_location) copy_artifacts_into_workspace( config["workspace"], "artifacts_out", challenge_location ) @@ -30,19 +32,13 @@ def run_agent( f"Running Python function 
'{config['entry_path']}' with timeout {timeout}" ) - # Get the current working directory - cwd = os.path.join(os.getcwd(), config["home_path"]) - - # Add current directory to Python's import path - sys.path.append(cwd) - command = [sys.executable, config["entry_path"], str(task)] process = subprocess.Popen( command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True, - cwd=cwd, + cwd=os.getcwd(), ) start_time = time.time() @@ -79,7 +75,9 @@ def run_agent( def copy_artifacts_into_workspace( workspace: str, artifact_folder_name: str, challenge_dir_path: str ) -> None: - source_dir = os.path.join(challenge_dir_path, artifact_folder_name) + # this file is at agbenchmark\agent_interface.py + script_dir = Path(__file__).resolve().parent.parent + source_dir = os.path.join(script_dir, challenge_dir_path, artifact_folder_name) # Check if source_dir exists, if not then return immediately. if not os.path.exists(source_dir): diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index 94cba5b72..f4e3f2220 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -1,4 +1,5 @@ import json +from pathlib import Path from typing import List, Optional from pydantic import BaseModel @@ -32,7 +33,12 @@ class ChallengeData(BaseModel): @staticmethod def deserialize(path: str) -> "ChallengeData": + # this script is in root/agbenchmark/challenges/define_task_types.py + script_dir = Path(__file__).resolve().parent.parent.parent + path = str(script_dir / path) + print("Deserializing", path) + with open(path, "r") as file: data = json.load(file) return ChallengeData(**data) diff --git a/agbenchmark/challenges/interface/search/data.json b/agbenchmark/challenges/interface/search/data.json index 17ee1ac1a..f59b2dc9b 100644 --- a/agbenchmark/challenges/interface/search/data.json +++ b/agbenchmark/challenges/interface/search/data.json @@ -2,7 +2,7 @@ "name": "TestSearch", "category": ["interface"], "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", - "dependencies": [], + "dependencies": ["TestWriteFile"], "ground": { "answer": "This is a Heading\nThis is a paragraph.", "should_contain": ["Heading", "paragraph"], diff --git a/agbenchmark/challenges/test_all.py b/agbenchmark/challenges/test_all.py index 7dee0b2ab..f8bb23471 100644 --- a/agbenchmark/challenges/test_all.py +++ b/agbenchmark/challenges/test_all.py @@ -19,7 +19,7 @@ load_dotenv() IMPROVE = os.getenv("IMPROVE", "False") -json_files = glob.glob(f"{CURRENT_DIRECTORY}/challenges/**/data.json", recursive=True) +json_files = glob.glob(f"{CURRENT_DIRECTORY}/**/data.json", recursive=True) def get_test_path(json_file: str) -> str: @@ -55,7 +55,7 @@ def generate_tests() -> None: ) sys.path.append(str(custom_python_location)) - for (module_loader, name, ispkg) in pkgutil.iter_modules( + for module_loader, name, ispkg in pkgutil.iter_modules( [str(custom_python_location)] ): module = importlib.import_module(name) diff --git a/agbenchmark/config.json b/agbenchmark/config.json new file mode 100644 index 000000000..9dd8b16ab --- /dev/null +++ b/agbenchmark/config.json @@ -0,0 +1,5 @@ +{ + "workspace": "${os.path.join(Path.home(), 'miniagi')}", + "entry_path": "agbenchmark/benchmarks.py", + "cutoff": 60 +} diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index e321f5a26..87fdc9c10 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -6,9 +6,10 @@ from typing import Any, 
Dict, Generator import pytest -from agbenchmark.RegressionManager import RegressionManager +from agbenchmark.ReportManager import ReportManager from agbenchmark.start_benchmark import ( CONFIG_PATH, + INFO_TESTS_PATH, REGRESSION_TESTS_PATH, get_regression_data, ) @@ -106,7 +107,8 @@ def challenge_data(request: Any) -> None: return request.param -regression_manager = RegressionManager(REGRESSION_TESTS_PATH) +regression_manager = ReportManager(REGRESSION_TESTS_PATH) +info_manager = ReportManager(INFO_TESTS_PATH) def pytest_runtest_makereport(item: Any, call: Any) -> None: @@ -130,12 +132,21 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None: print("pytest_runtest_makereport", test_details) if call.excinfo is None: regression_manager.add_test(item.nodeid.split("::")[1], test_details) + test_details["success"] = True else: regression_manager.remove_test(item.nodeid.split("::")[1]) + test_details["success"] = False + test_details["fail_reason"] = str(call.excinfo.value) + info_manager.add_test(item.nodeid.split("::")[1], test_details) -def pytest_sessionfinish() -> None: - """Called at the end of the session to save regression tests""" + +def pytest_sessionfinish(session: Any) -> None: + """Called at the end of the session to save regression tests and info""" + with open(CONFIG_PATH, "r") as f: + config = json.load(f) + + info_manager.end_info_report(config) regression_manager.save() diff --git a/agbenchmark/regression_tests.json b/agbenchmark/regression_tests.json new file mode 100644 index 000000000..68632a127 --- /dev/null +++ b/agbenchmark/regression_tests.json @@ -0,0 +1,99 @@ +{ + "TestReadFile": { + "difficulty": "basic", + "dependencies": [ + "TestWriteFile" + ], + "test": "agbenchmark/challenges/interface/read_file", + "success": true + }, + "TestBasicMemory": { + "difficulty": "basic", + "dependencies": [ + "TestReadFile", + "TestWriteFile" + ], + "test": "agbenchmark/challenges/memory/m1", + "success": true + }, + "TestBasicRetrieval": { + "difficulty": "basic", + "dependencies": [ + "TestWriteFile", + "TestSearch" + ], + "test": "agbenchmark/challenges/retrieval/r1", + "success": true + }, + "TestRememberMultipleIds": { + "difficulty": "basic", + "dependencies": [ + "TestBasicMemory" + ], + "test": "agbenchmark/challenges/memory/m2", + "success": true + }, + "TestRetrieval2": { + "difficulty": "basic", + "dependencies": [ + "TestBasicRetrieval" + ], + "test": "agbenchmark/challenges/retrieval/r2", + "success": true + }, + "TestRememberMultipleIdsWithNoise": { + "difficulty": "medium", + "dependencies": [ + "TestRememberMultipleIds" + ], + "test": "agbenchmark/challenges/memory/m3", + "success": true + }, + "TestRetrieval3": { + "difficulty": "basic", + "dependencies": [ + "TestRetrieval2" + ], + "test": "agbenchmark/challenges/retrieval/r3", + "success": true + }, + "TestRememberMultiplePhrasesWithNoise": { + "difficulty": "medium", + "dependencies": [ + "TestRememberMultipleIdsWithNoise" + ], + "test": "agbenchmark/challenges/memory/m4", + "success": true + }, + "TestSearch": { + "difficulty": "basic", + "dependencies": [ + "TestWriteFile" + ], + "test": "agbenchmark/challenges/interface/search", + "success": true + }, + "TestWriteFile": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/interface/write_file", + "success": true + }, + "TestDebugSimpleTypoWithGuidance": { + "difficulty": "basic", + "dependencies": [ + "TestReadFile", + "TestWriteFile" + ], + "test": "agbenchmark/challenges/code/d1", + "success": true + }, + 
"TestDebugSimpleTypoWithoutGuidance": { + "difficulty": "medium", + "dependencies": [ + "TestDebugSimpleTypoWithGuidance" + ], + "test": "agbenchmark/challenges/code/d2", + "success": true + } +} \ No newline at end of file diff --git a/agbenchmark/reports/1.json b/agbenchmark/reports/1.json new file mode 100644 index 000000000..df07fb878 --- /dev/null +++ b/agbenchmark/reports/1.json @@ -0,0 +1,109 @@ +{ + "command": "agbenchmark start --mock", + "completion_time": "2023-07-10-21:19", + "time_elapsed": "8.75 seconds", + "tests": { + "TestWriteFile": { + "difficulty": "basic", + "dependencies": [], + "test": "agbenchmark/challenges/interface/write_file", + "success": true + }, + "TestReadFile": { + "difficulty": "basic", + "dependencies": [ + "TestWriteFile" + ], + "test": "agbenchmark/challenges/interface/read_file", + "success": true + }, + "TestSearch": { + "difficulty": "basic", + "dependencies": [ + "TestWriteFile" + ], + "test": "agbenchmark/challenges/interface/search", + "success": true + }, + "TestDebugSimpleTypoWithGuidance": { + "difficulty": "basic", + "dependencies": [ + "TestReadFile", + "TestWriteFile" + ], + "test": "agbenchmark/challenges/code/d1", + "success": true + }, + "TestBasicMemory": { + "difficulty": "basic", + "dependencies": [ + "TestReadFile", + "TestWriteFile" + ], + "test": "agbenchmark/challenges/memory/m1", + "success": true + }, + "TestBasicRetrieval": { + "difficulty": "basic", + "dependencies": [ + "TestWriteFile", + "TestSearch" + ], + "test": "agbenchmark/challenges/retrieval/r1", + "success": true + }, + "TestDebugSimpleTypoWithoutGuidance": { + "difficulty": "medium", + "dependencies": [ + "TestDebugSimpleTypoWithGuidance" + ], + "test": "agbenchmark/challenges/code/d2", + "success": true + }, + "TestRememberMultipleIds": { + "difficulty": "basic", + "dependencies": [ + "TestBasicMemory" + ], + "test": "agbenchmark/challenges/memory/m2", + "success": true + }, + "TestRetrieval2": { + "difficulty": "basic", + "dependencies": [ + "TestBasicRetrieval" + ], + "test": "agbenchmark/challenges/retrieval/r2", + "success": true + }, + "TestRememberMultipleIdsWithNoise": { + "difficulty": "medium", + "dependencies": [ + "TestRememberMultipleIds" + ], + "test": "agbenchmark/challenges/memory/m3", + "success": true + }, + "TestRetrieval3": { + "difficulty": "basic", + "dependencies": [ + "TestRetrieval2" + ], + "test": "agbenchmark/challenges/retrieval/r3", + "success": true + }, + "TestRememberMultiplePhrasesWithNoise": { + "difficulty": "medium", + "dependencies": [ + "TestRememberMultipleIdsWithNoise" + ], + "test": "agbenchmark/challenges/memory/m4", + "success": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}", + "entry_path": "agbenchmark/benchmarks.py", + "cutoff": 60 + } +} \ No newline at end of file diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index 68c7932be..917cd4e8a 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -10,12 +10,16 @@ from dotenv import load_dotenv load_dotenv() +from agbenchmark.utils import calculate_info_test_path + CURRENT_DIRECTORY = Path(__file__).resolve().parent +benchmarks_folder_path = Path(os.getcwd()) / "agbenchmark" -CONFIG_PATH = str(Path(os.getcwd()) / "config.json") +CONFIG_PATH = str(benchmarks_folder_path / "config.json") +REGRESSION_TESTS_PATH = str(benchmarks_folder_path / "regression_tests.json") -REGRESSION_TESTS_PATH = str(Path(os.getcwd()) / "regression_tests.json") +INFO_TESTS_PATH = 
calculate_info_test_path(benchmarks_folder_path) @click.group() @@ -25,10 +29,11 @@ def cli() -> None: @cli.command() @click.option("--category", default=None, help="Specific category to run") +@click.option("--test", default=None, help="Specific test to run") @click.option("--maintain", is_flag=True, help="Runs only regression tests") @click.option("--improve", is_flag=True, help="Run only non-regression tests") @click.option("--mock", is_flag=True, help="Run with mock") -def start(category: str, maintain: bool, improve: bool, mock: bool) -> int: +def start(category: str, test: str, maintain: bool, improve: bool, mock: bool) -> int: """Start the benchmark tests. If a category flag is provided, run the categories with that mark.""" # Check if configuration file exists and is not empty if maintain and improve: @@ -37,6 +42,16 @@ def start(category: str, maintain: bool, improve: bool, mock: bool) -> int: ) return 1 + if test and (category or maintain or improve): + print( + "Error: If you're running a specific test make sure no other options are selected. Please just pass the --test." + ) + return 1 + + if not benchmarks_folder_path.exists(): + benchmarks_folder_path.mkdir(exist_ok=True) + + print(CONFIG_PATH, os.path.exists(CONFIG_PATH), os.stat(CONFIG_PATH).st_size) if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0: config = {} @@ -46,12 +61,12 @@ def start(category: str, maintain: bool, improve: bool, mock: bool) -> int: ) config["entry_path"] = click.prompt( - "Please enter a the path to your run_specific_agent function implementation", - default="/benchmarks.py", + "Please enter a the path to your run_specific_agent function implementation within the benchmarks folder", + default="benchmarks.py", ) config["cutoff"] = click.prompt( - "Please enter a hard cutoff runtime for your agent", + "Please enter a hard cutoff runtime for your agent per test", default="60", ) @@ -65,7 +80,11 @@ def start(category: str, maintain: bool, improve: bool, mock: bool) -> int: os.environ["MOCK_TEST"] = "True" if mock else "False" if not os.path.exists(REGRESSION_TESTS_PATH): - with open(REGRESSION_TESTS_PATH, "a"): + with open(REGRESSION_TESTS_PATH, "w"): + pass + + if not os.path.exists(INFO_TESTS_PATH): + with open(INFO_TESTS_PATH, "w"): pass print("Current configuration:") @@ -73,18 +92,22 @@ def start(category: str, maintain: bool, improve: bool, mock: bool) -> int: print(f"{key}: {value}") pytest_args = ["-vs"] - if category: - pytest_args.extend(["-m", category]) - print("Starting benchmark tests ", category) + if test: + print("Running specific test:", test) + pytest_args.extend(["-k", test]) else: - print("Running all categories") - - if maintain: - print("Running only regression tests") - pytest_args.append("--maintain") - elif improve: - print("Running only non-regression tests") - pytest_args.append("--improve") + if category: + pytest_args.extend(["-m", category]) + print("Running tests of category:", category) + else: + print("Running all categories") + + if maintain: + print("Running only regression tests") + pytest_args.append("--maintain") + elif improve: + print("Running only non-regression tests") + pytest_args.append("--improve") if mock: pytest_args.append("--mock") diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py index b05a7ac31..ffde0c6d3 100644 --- a/agbenchmark/utils.py +++ b/agbenchmark/utils.py @@ -1 +1,17 @@ # radio charts, logs, helper functions for tests, anything else relevant. 
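One consequence of the dispatch above is worth stating concretely: --test short-circuits every other selection flag and maps to pytest's -k, while categories map to -m markers. A small sketch of the same argument assembly, with the CLI flag values as plain parameters:

def build_pytest_args(test=None, category=None, maintain=False, improve=False, mock=False):
    """Mirror of the CLI dispatch: --test wins, otherwise category/regression flags apply."""
    args = ["-vs"]
    if test:
        args.extend(["-k", test])          # run a single test by name
    else:
        if category:
            args.extend(["-m", category])  # run every test marked with the category
        if maintain:
            args.append("--maintain")      # regression tests only
        elif improve:
            args.append("--improve")       # non-regression tests only
    if mock:
        args.append("--mock")
    return args

assert build_pytest_args(test="TestWriteFile", mock=True) == ["-vs", "-k", "TestWriteFile", "--mock"]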
+import glob +from pathlib import Path + + +def calculate_info_test_path(benchmarks_folder_path: Path) -> str: + INFO_TESTS_PATH = benchmarks_folder_path / "reports" + + if not INFO_TESTS_PATH.exists(): + INFO_TESTS_PATH.mkdir(parents=True, exist_ok=True) + return str(INFO_TESTS_PATH / "1.json") + else: + json_files = glob.glob(str(INFO_TESTS_PATH / "*.json")) + file_count = len(json_files) + run_name = f"{file_count + 1}.json" + new_file_path = INFO_TESTS_PATH / run_name + return str(new_file_path) diff --git a/agent/Auto-GPT b/agent/Auto-GPT index f360d503b..dc2a76990 160000 --- a/agent/Auto-GPT +++ b/agent/Auto-GPT @@ -1 +1 @@ -Subproject commit f360d503b113119f6b3ce0acff1dbb4dfae2223a +Subproject commit dc2a76990c75fafacbeaa76eb2e27d48de44cadd diff --git a/agent/SuperAGI b/agent/SuperAGI index 7ab2994d4..a28224d82 160000 --- a/agent/SuperAGI +++ b/agent/SuperAGI @@ -1 +1 @@ -Subproject commit 7ab2994d4b44fa008f9ac27b196f134d27878916 +Subproject commit a28224d82572b598ccee1057086fabaf33e1aaa9 diff --git a/agent/config_example.json b/agent/config_example.json index ba2ec0b80..7ab65bc20 100644 --- a/agent/config_example.json +++ b/agent/config_example.json @@ -1,6 +1,5 @@ { "workspace": "projects/my-new-project/workspace", - "entry_path": "benchmarks.py", - "home_path": "", + "entry_path": "agbenchmark/benchmarks.py", "cutoff": 60 } diff --git a/agent/gpt-engineer b/agent/gpt-engineer index 4af8c137e..cde9be3e7 160000 --- a/agent/gpt-engineer +++ b/agent/gpt-engineer @@ -1 +1 @@ -Subproject commit 4af8c137e82cc51fdd31c23327ceffd64194b984 +Subproject commit cde9be3e73212b3d8366a4ed149a18122bfe2333 diff --git a/agent/mini-agi b/agent/mini-agi index 4af8a7e60..ad2b34505 160000 --- a/agent/mini-agi +++ b/agent/mini-agi @@ -1 +1 @@ -Subproject commit 4af8a7e6085f0518f06180fbf87024a2c9db4c88 +Subproject commit ad2b345050e07efb7ad0bde68c93bc2b4e2d7a92 diff --git a/agent/smol-developer b/agent/smol-developer index a1e4a9ff3..c52b14b1d 160000 --- a/agent/smol-developer +++ b/agent/smol-developer @@ -1 +1 @@ -Subproject commit a1e4a9ff3a75909c4a892e409a55f86a2c57b7c6 +Subproject commit c52b14b1d5b1b74d886f08d9914e7f43437f609d diff --git a/config.json b/config.json deleted file mode 100644 index 8bbcebdbd..000000000 --- a/config.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "benchmarks.py", - "home_path": "agent/mini-agi", - "cutoff": 60 -} diff --git a/mypy.ini b/mypy.ini index 764c239f1..d35c6962d 100644 --- a/mypy.ini +++ b/mypy.ini @@ -15,5 +15,5 @@ ignore_errors = True [mypy-agbenchmark.mocks.tests.basic_mocks.*] ignore_errors = True -[mypy-agbenchmark.tests.regression.RegressionManager.*] +[mypy-agbenchmark.tests.regression.ReportManager.*] ignore_errors = True diff --git a/regression_tests.json b/regression_tests.json deleted file mode 100644 index 0cf2d5f30..000000000 --- a/regression_tests.json +++ /dev/null @@ -1,90 +0,0 @@ -{ - "TestBasicMemory": { - "difficulty": "basic", - "dependencies": [ - "TestReadFile", - "TestWriteFile" - ], - "test": "agbenchmark/challenges/memory/m1" - }, - "TestBasicRetrieval": { - "difficulty": "basic", - "dependencies": [ - "TestWriteFile", - "TestSearch" - ], - "test": "agbenchmark/challenges/retrieval/r1" - }, - "TestCreateSimpleWebServer": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/code/d3" - }, - "TestDebugSimpleTypoWithGuidance": { - "difficulty": "basic", - "dependencies": [ - "TestReadFile", - "TestWriteFile" - ], - "test": 
"agbenchmark/challenges/code/d1" - }, - "TestDebugSimpleTypoWithoutGuidance": { - "difficulty": "medium", - "dependencies": [ - "TestDebugSimpleTypoWithGuidance" - ], - "test": "agbenchmark/challenges/code/d2" - }, - "TestReadFile": { - "difficulty": "basic", - "dependencies": [ - "TestWriteFile" - ], - "test": "agbenchmark/challenges/interface/read_file" - }, - "TestRememberMultipleIds": { - "difficulty": "basic", - "dependencies": [ - "TestBasicMemory" - ], - "test": "agbenchmark/challenges/memory/m2" - }, - "TestRememberMultipleIdsWithNoise": { - "difficulty": "medium", - "dependencies": [ - "TestRememberMultipleIds" - ], - "test": "agbenchmark/challenges/memory/m3" - }, - "TestRememberMultiplePhrasesWithNoise": { - "difficulty": "medium", - "dependencies": [ - "TestRememberMultipleIdsWithNoise" - ], - "test": "agbenchmark/challenges/memory/m4" - }, - "TestRetrieval2": { - "difficulty": "basic", - "dependencies": [ - "TestBasicRetrieval" - ], - "test": "agbenchmark/challenges/retrieval/r2" - }, - "TestRetrieval3": { - "difficulty": "basic", - "dependencies": [ - "TestRetrieval2" - ], - "test": "agbenchmark/challenges/retrieval/r3" - }, - "TestSearch": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/interface/search" - }, - "TestWriteFile": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/interface/write_file" - } -} \ No newline at end of file -- cgit v1.2.3 From 0799be7e28bf4805e5cd2c9296c142b31f9501a4 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Mon, 10 Jul 2023 21:54:25 -0700 Subject: Fix tests ci (#82) --- .github/workflows/ci.yml | 7 +------ agbenchmark/agent_interface.py | 8 +++++--- agbenchmark/challenges/test_all.py | 32 ++++++++++++++------------------ agent/Auto-GPT | 2 +- agent/gpt-engineer | 2 +- agent/smol-developer | 2 +- poetry.lock | 29 ++++++++++++++++++++++++++++- pyproject.toml | 1 + 8 files changed, 52 insertions(+), 31 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cac1dedb1..9df4173b2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -75,6 +75,7 @@ jobs: tests: name: ${{ matrix.agent-name }} runs-on: ubuntu-latest + timeout-minutes: 10 env: min-python-version: "3.10" strategy: @@ -156,16 +157,10 @@ jobs: if [ "${GITHUB_EVENT_NAME}" == "schedule" ] || [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ]; then agbenchmark start --maintain else - exit 0 agbenchmark start --maintain --mock agbenchmark start --improve --mock agbenchmark start --mock agbenchmark start --mock --category=retrieval - agbenchmark start --mock --category=regression - agbenchmark start --mock --category=interface - agbenchmark start --mock --category=code - agbenchmark start --mock --category=memory - agbenchmark start --mock --category=memory --category=code fi env: GITHUB_EVENT_NAME: ${{ github.event_name }} diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index d058ad4c2..713451f01 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -3,11 +3,12 @@ import shutil import subprocess import sys import time -from pathlib import Path from typing import Any, Dict from dotenv import load_dotenv +from agbenchmark.start_benchmark import CURRENT_DIRECTORY + load_dotenv() mock_test_str = os.getenv("MOCK_TEST") @@ -76,8 +77,9 @@ def copy_artifacts_into_workspace( workspace: str, artifact_folder_name: str, challenge_dir_path: str ) -> None: # this file is at agbenchmark\agent_interface.py - script_dir = 
Path(__file__).resolve().parent.parent - source_dir = os.path.join(script_dir, challenge_dir_path, artifact_folder_name) + source_dir = os.path.join( + CURRENT_DIRECTORY, "..", challenge_dir_path, artifact_folder_name + ) # Check if source_dir exists, if not then return immediately. if not os.path.exists(source_dir): diff --git a/agbenchmark/challenges/test_all.py b/agbenchmark/challenges/test_all.py index f8bb23471..00a6ed635 100644 --- a/agbenchmark/challenges/test_all.py +++ b/agbenchmark/challenges/test_all.py @@ -49,19 +49,6 @@ def generate_tests() -> None: class_name = data.get("name", "") challenge_location = get_test_path(json_file) - if data["ground"]["type"] == "custom_python": - custom_python_location = ( - f"{CURRENT_DIRECTORY}/../{challenge_location}/custom_python" - ) - sys.path.append(str(custom_python_location)) - - for module_loader, name, ispkg in pkgutil.iter_modules( - [str(custom_python_location)] - ): - module = importlib.import_module(name) - - if hasattr(module, "make_assertion"): - make_assertion = getattr(module, "make_assertion") # Define test class dynamically challenge_class = types.new_class(class_name, (Challenge,)) @@ -75,11 +62,20 @@ def generate_tests() -> None: scores = self.get_scores(config) # Check if make_assertion is defined and use it - if "make_assertion" in locals(): - try: - make_assertion() - except AssertionError as error: - print(error) # Or handle this in another way + if self.data.ground.type == "custom_python": + custom_python_location = ( + f"{CURRENT_DIRECTORY}/../{challenge_location}/custom_python" + ) + sys.path.append(str(custom_python_location)) + + for (module_loader, name, ispkg) in pkgutil.iter_modules( + [str(custom_python_location)] + ): + module = importlib.import_module(name) + + if hasattr(module, "make_assertion"): + make_assertion = getattr(module, "make_assertion") + make_assertion() else: assert 1 in scores diff --git a/agent/Auto-GPT b/agent/Auto-GPT index dc2a76990..ade8e6f81 160000 --- a/agent/Auto-GPT +++ b/agent/Auto-GPT @@ -1 +1 @@ -Subproject commit dc2a76990c75fafacbeaa76eb2e27d48de44cadd +Subproject commit ade8e6f8142a937160596a987ab96808b583f9e3 diff --git a/agent/gpt-engineer b/agent/gpt-engineer index cde9be3e7..538bcba6e 160000 --- a/agent/gpt-engineer +++ b/agent/gpt-engineer @@ -1 +1 @@ -Subproject commit cde9be3e73212b3d8366a4ed149a18122bfe2333 +Subproject commit 538bcba6efbb7cda7f6a355a8c8420bbbdb52f25 diff --git a/agent/smol-developer b/agent/smol-developer index c52b14b1d..150981f77 160000 --- a/agent/smol-developer +++ b/agent/smol-developer @@ -1 +1 @@ -Subproject commit c52b14b1d5b1b74d886f08d9914e7f43437f609d +Subproject commit 150981f77f19777bf5aa76cb3a74869e4a8a8a05 diff --git a/poetry.lock b/poetry.lock index 5526da16b..ad72f5e10 100644 --- a/poetry.lock +++ b/poetry.lock @@ -729,6 +729,21 @@ files = [ {file = "pathspec-0.11.1.tar.gz", hash = "sha256:2798de800fa92780e33acca925945e9a19a133b715067cf165b8866c15a31687"}, ] +[[package]] +name = "pexpect" +version = "4.8.0" +description = "Pexpect allows easy control of interactive console applications." 
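pexpect joins the dependency set in this commit. For orientation, a minimal sketch of the interactive process control it provides; the spawned command and prompt text are hypothetical, not anything the benchmark is shown running:

import pexpect

# Spawn a child process and script its stdin/stdout (hypothetical command)
child = pexpect.spawn("python -m some_agent", encoding="utf-8", timeout=60)
child.expect("Continue")   # block until the child prints matching output
child.sendline("y")        # answer the prompt programmatically
child.expect(pexpect.EOF)  # wait for the child to exit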
+category = "main" +optional = false +python-versions = "*" +files = [ + {file = "pexpect-4.8.0-py2.py3-none-any.whl", hash = "sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937"}, + {file = "pexpect-4.8.0.tar.gz", hash = "sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c"}, +] + +[package.dependencies] +ptyprocess = ">=0.5" + [[package]] name = "platformdirs" version = "3.8.0" @@ -761,6 +776,18 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "ptyprocess" +version = "0.7.0" +description = "Run a subprocess in a pseudo terminal" +category = "main" +optional = false +python-versions = "*" +files = [ + {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, + {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"}, +] + [[package]] name = "pycodestyle" version = "2.7.0" @@ -1109,4 +1136,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "81b84bbe08d4a09fb6a4f99c7fb018e0c0fcd879fa368c388b0af20c7c9a3f31" +content-hash = "09871e879785f0a7d5c31a61553cd2df08d88324a864b9c56b8e97d95893157f" diff --git a/pyproject.toml b/pyproject.toml index 1a96a51de..b0526ab57 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ pytest-depends = "^1.0.1" python-dotenv = "^0.21.0" click = "^8.1.3" types-requests = "^2.31.0.1" +pexpect = "^4.8.0" [tool.poetry.group.dev.dependencies] flake8 = "^3.9.2" -- cgit v1.2.3 From 46f31cb643a4803c04f0a1cb5af8bde6afd0a90e Mon Sep 17 00:00:00 2001 From: Luke <2609441+lc0rp@users.noreply.github.com> Date: Tue, 11 Jul 2023 07:40:33 -0400 Subject: Bulletin & version update for 0.4.4 (#4937) Co-authored-by: Reinier van der Leer Co-authored-by: lc0rp <2609411+lc0rp@users.noreply.github.com> --- BULLETIN.md | 43 +++++++++++++++++++++++++------------------ pyproject.toml | 2 +- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/BULLETIN.md b/BULLETIN.md index 0b8afeba4..117a436a8 100644 --- a/BULLETIN.md +++ b/BULLETIN.md @@ -1,22 +1,29 @@ -# Website and Documentation Site 📰📖 -Check out *https://agpt.co*, the official news & updates site for Auto-GPT! -The documentation also has a place here, at *https://docs.agpt.co* +# QUICK LINKS 🔗 +# -------------- +🌎 *Official Website*: https://agpt.co. +📖 *User Guide*: https://docs.agpt.co. +👩 *Contributors Wiki*: https://github.com/Significant-Gravitas/Auto-GPT/wiki/Contributing. -# For contributors 👷🏼 -Since releasing v0.3.0, whave been working on re-architecting the Auto-GPT core to make it more extensible and make room for structural performance-oriented R&D. +# v0.4.4 RELEASE HIGHLIGHTS! 🚀 +# ----------------------------- +## GPT-4 is back! +Following OpenAI's recent GPT-4 GA announcement, the SMART_LLM .env setting +now defaults to GPT-4, and Auto-GPT will use GPT-4 by default in its main loop. -Check out the contribution guide on our wiki: -https://github.com/Significant-Gravitas/Auto-GPT/wiki/Contributing +### !! High Costs Warning !! 💰💀🚨 +GPT-4 costs ~20x more than GPT-3.5-turbo. +Please take note of this before using SMART_LLM. You can use `--gpt3only` +or `--gpt4only` to force the use of GPT-3.5-turbo or GPT-4, respectively, +at runtime. 
-# 🚀 v0.4.3 Release 🚀 -We're happy to announce the 0.4.3 maintenance release, which primarily focuses on refining the LLM command execution, -extending support for OpenAI's latest models (including the powerful GPT-3 16k model), and laying the groundwork -for future compatibility with OpenAI's function calling feature. +## Re-arch v1 preview release! +We've released a preview version of the re-arch code, under `autogpt/core`. +This is a major milestone for us, and we're excited to continue working on it. +We look forward to your feedback. Follow the process here: +https://github.com/Significant-Gravitas/Auto-GPT/issues/4770. -Key Highlights: -- OpenAI API Key Prompt: Auto-GPT will now courteously prompt users for their OpenAI API key, if it's not already provided. -- Summarization Enhancements: We've optimized Auto-GPT's use of the LLM context window even further. -- JSON Memory Reading: Support for reading memories from JSON files has been improved, resulting in enhanced task execution. -- Deprecated commands, removed for a leaner, more performant LLM: analyze_code, write_tests, improve_code, audio_text, web_playwright, web_requests -## Take a look at the Release Notes on Github for the full changelog! -https://github.com/Significant-Gravitas/Auto-GPT/releases +## Other highlights +Other fixes include plugins regressions, Azure config and security patches. + +Take a look at the Release Notes on Github for the full changelog! +https://github.com/Significant-Gravitas/Auto-GPT/releases. diff --git a/pyproject.toml b/pyproject.toml index b0aea625c..06b2f87f8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "agpt" -version = "0.4.3" +version = "0.4.4" authors = [ { name="Torantulino", email="support@agpt.co" }, ] -- cgit v1.2.3 From 22295350a63cad4ab0be2af83e68cc8e106b7201 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Tue, 11 Jul 2023 09:57:53 -0700 Subject: All Agents log to helicone automatically (#85) Signed-off-by: Merwane Hamadi Co-authored-by: Justin --- .github/workflows/ci.yml | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9df4173b2..4d52dd027 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,17 +2,16 @@ name: CI on: workflow_dispatch: - branches: [ master ] + branches: [master] schedule: - - cron: '0 8 * * *' + - cron: "0 8 * * *" push: - branches: [ master, ci-test* ] + branches: [master, ci-test*] pull_request: - branches: [ stable, master, release-* ] + branches: [stable, master, release-*] jobs: lint: - runs-on: ubuntu-latest env: min-python-version: "3.10" @@ -83,10 +82,9 @@ jobs: matrix: agent-name: - "gpt-engineer" + - "smol-developer" - "Auto-GPT" - "mini-agi" - - "smol-developer" - steps: - name: Checkout repository uses: actions/checkout@v3 @@ -151,10 +149,11 @@ jobs: echo "Unknown agent name: $AGENT_NAME" exit 1 fi - + pip install ../../dist/*.whl if [ "${GITHUB_EVENT_NAME}" == "schedule" ] || [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ]; then + curl -s https://raw.githubusercontent.com/Helicone/helicone/main/mitmproxy.sh | bash -s start agbenchmark start --maintain else agbenchmark start --maintain --mock @@ -168,6 +167,8 @@ jobs: AGENT_NAME: ${{ matrix.agent-name }} PROMPT_USER: false # For mini-agi. TODO: Remove this once mini-agi follows the standards. 
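Two of the settings that follow work together: the mitmproxy script routes each agent's OpenAI traffic through Helicone, and REQUESTS_CA_BUNDLE is how Python's requests library is told which certificate bundle to trust for TLS verification, which the intercepting proxy's CA is expected to be part of. A sketch of the effect; the URL and key are placeholders:

import os
import requests

# Point requests at the system bundle the proxy setup script is assumed to amend
os.environ["REQUESTS_CA_BUNDLE"] = "/etc/ssl/certs/ca-certificates.crt"

# With the intercepting proxy active, this call is transparently logged by Helicone
response = requests.get(
    "https://api.openai.com/v1/models",
    headers={"Authorization": "Bearer <OPENAI_API_KEY>"},
)
print(response.status_code)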
HELICONE_API_KEY: ${{ secrets.HELICONE_API_KEY }} + REQUESTS_CA_BUNDLE: /etc/ssl/certs/ca-certificates.crt + HELICONE_CACHE_ENABLED: false - name: Upload logs as artifact if: always() -- cgit v1.2.3 From 4ecb70c5e3e2fbf63780ba983cc1e96eea251541 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Tue, 11 Jul 2023 12:11:24 -0700 Subject: Fix Auto-GPT integration by adding python module as entrypoint (#86) Co-authored-by: Silen Naihin --- agbenchmark/agent_interface.py | 3 +-- agbenchmark/config.json | 2 +- agent/Auto-GPT | 2 +- agent/SuperAGI | 2 +- agent/gpt-engineer | 2 +- agent/mini-agi | 2 +- agent/smol-developer | 2 +- 7 files changed, 7 insertions(+), 8 deletions(-) diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index 713451f01..c737f3079 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -32,8 +32,7 @@ def run_agent( print( f"Running Python function '{config['entry_path']}' with timeout {timeout}" ) - - command = [sys.executable, config["entry_path"], str(task)] + command = [sys.executable, "-m", config["entry_path"], str(task)] process = subprocess.Popen( command, stdout=subprocess.PIPE, diff --git a/agbenchmark/config.json b/agbenchmark/config.json index 9dd8b16ab..af83029ef 100644 --- a/agbenchmark/config.json +++ b/agbenchmark/config.json @@ -1,5 +1,5 @@ { "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "agbenchmark/benchmarks.py", + "entry_path": "agbenchmark.benchmarks", "cutoff": 60 } diff --git a/agent/Auto-GPT b/agent/Auto-GPT index ade8e6f81..e5fbe4313 160000 --- a/agent/Auto-GPT +++ b/agent/Auto-GPT @@ -1 +1 @@ -Subproject commit ade8e6f8142a937160596a987ab96808b583f9e3 +Subproject commit e5fbe4313e0ebf7f75514a181a5d2044a7babd26 diff --git a/agent/SuperAGI b/agent/SuperAGI index a28224d82..928051291 160000 --- a/agent/SuperAGI +++ b/agent/SuperAGI @@ -1 +1 @@ -Subproject commit a28224d82572b598ccee1057086fabaf33e1aaa9 +Subproject commit 9280512910c74bc33333e2ce7c48e47021227529 diff --git a/agent/gpt-engineer b/agent/gpt-engineer index 538bcba6e..42400fd67 160000 --- a/agent/gpt-engineer +++ b/agent/gpt-engineer @@ -1 +1 @@ -Subproject commit 538bcba6efbb7cda7f6a355a8c8420bbbdb52f25 +Subproject commit 42400fd67972278e454621e7abf450a4f899a44a diff --git a/agent/mini-agi b/agent/mini-agi index ad2b34505..6a1d08880 160000 --- a/agent/mini-agi +++ b/agent/mini-agi @@ -1 +1 @@ -Subproject commit ad2b345050e07efb7ad0bde68c93bc2b4e2d7a92 +Subproject commit 6a1d08880c65fe3e5831243c1e1ea19acf85516c diff --git a/agent/smol-developer b/agent/smol-developer index 150981f77..a0e9f4f39 160000 --- a/agent/smol-developer +++ b/agent/smol-developer @@ -1 +1 @@ -Subproject commit 150981f77f19777bf5aa76cb3a74869e4a8a8a05 +Subproject commit a0e9f4f39e26a56b13a364be09fc58d2d85150ea -- cgit v1.2.3 From b3c506cd943f82f65720c116a770d062a37e0982 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Tue, 11 Jul 2023 17:02:29 -0700 Subject: Fix Auto-GPT looping forever (#87) --- agbenchmark/agent_interface.py | 50 +++++++++++++++++++++++++----------------- agent/Auto-GPT | 2 +- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index c737f3079..a1a79ada0 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -42,35 +42,45 @@ def run_agent( ) start_time = time.time() - timeout = config["cutoff"] - while True: - if process.stdout is None: - continue + print( + f"Running Python function '{config['entry_path']}' with 
timeout {config['cutoff']}" + ) + command = [sys.executable, "-m", config["entry_path"], str(task)] + process = subprocess.Popen( + command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True, + ) - while output := process.stdout.readline(): - print(output.strip()) + start_time = time.time() - # Check if process has ended - if process.poll() is not None: - print("The Python function has finished running.") - break + while True: + output = "" + if process.stdout is not None: + output = process.stdout.readline() + print(output.strip()) - # Check if process has exceeded timeout - if time.time() - start_time > timeout: - print( - "The Python function has exceeded the time limit and was terminated." - ) - # Terminate the process group - process.terminate() + # Check if process has ended, has no more output, or exceeded timeout + if ( + process.poll() is not None + or output == "" + or (time.time() - start_time > config["cutoff"]) + ): break - # Optional: sleep for a while - time.sleep(0.1) + if time.time() - start_time > config["cutoff"]: + print("The Python function has exceeded the time limit and was terminated.") + process.kill() + else: + print("The Python function has finished running.") - # Wait for process to terminate, then get return code process.wait() + if process.returncode != 0: + print(f"The agent timed out") + def copy_artifacts_into_workspace( workspace: str, artifact_folder_name: str, challenge_dir_path: str diff --git a/agent/Auto-GPT b/agent/Auto-GPT index e5fbe4313..d4fc134f8 160000 --- a/agent/Auto-GPT +++ b/agent/Auto-GPT @@ -1 +1 @@ -Subproject commit e5fbe4313e0ebf7f75514a181a5d2044a7babd26 +Subproject commit d4fc134f8c4bd7b63f283f932f68932317f53f78 -- cgit v1.2.3 From 504634b4a6d9a1bb327b026694f2bf1692226bee Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Tue, 11 Jul 2023 20:50:56 -0700 Subject: Add custom properties to Helicone (#91) --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4d52dd027..50adac76d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -169,6 +169,7 @@ jobs: HELICONE_API_KEY: ${{ secrets.HELICONE_API_KEY }} REQUESTS_CA_BUNDLE: /etc/ssl/certs/ca-certificates.crt HELICONE_CACHE_ENABLED: false + HELICONE_PROPERTY_AGENT: ${{ matrix.agent-name }} - name: Upload logs as artifact if: always() -- cgit v1.2.3 From e292ffebaff80d9eaeaea6c5c8600a5d53361e5f Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Tue, 11 Jul 2023 21:37:49 -0700 Subject: Enable cache (#92) --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 50adac76d..45bd64fff 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -168,7 +168,7 @@ jobs: PROMPT_USER: false # For mini-agi. TODO: Remove this once mini-agi follows the standards. 
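The looping fix above rests on a property of pipes worth spelling out: readline() on a child's stdout returns an empty string once the pipe reaches EOF, so treating "" as a stop condition ends the loop even before poll() reports an exit code. A compact sketch of that run-with-cutoff pattern; the command is illustrative:

import subprocess
import sys
import time

def run_with_cutoff(command: list[str], cutoff: float) -> int:
    """Stream a child's output until EOF, exit, or the cutoff elapses."""
    process = subprocess.Popen(
        command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
        universal_newlines=True,
    )
    start = time.time()
    while True:
        line = process.stdout.readline()  # "" only when the pipe hits EOF
        if line:
            print(line.strip())
        if process.poll() is not None or line == "" or time.time() - start > cutoff:
            break
    if time.time() - start > cutoff:
        process.kill()  # hard stop for a runaway agent
    process.wait()
    return process.returncode

# e.g. run_with_cutoff([sys.executable, "-m", "agbenchmark.benchmarks", "task"], 60)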
HELICONE_API_KEY: ${{ secrets.HELICONE_API_KEY }} REQUESTS_CA_BUNDLE: /etc/ssl/certs/ca-certificates.crt - HELICONE_CACHE_ENABLED: false + HELICONE_CACHE_ENABLED: true HELICONE_PROPERTY_AGENT: ${{ matrix.agent-name }} - name: Upload logs as artifact -- cgit v1.2.3 From 8d0c5179ed94fcf673403293c4664be4da542333 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Wed, 12 Jul 2023 01:37:59 -0400 Subject: fixing backslashes, adding basic metrics (#89) --- agbenchmark/ReportManager.py | 23 +-- agbenchmark/agent_interface.py | 16 -- agbenchmark/challenges/code/d1/data.json | 2 +- agbenchmark/challenges/code/d2/data.json | 4 +- agbenchmark/challenges/code/d3/data.json | 2 +- agbenchmark/challenges/define_task_types.py | 41 ++++- .../challenges/interface/read_file/data.json | 2 +- agbenchmark/challenges/interface/search/data.json | 2 +- .../challenges/interface/write_file/data.json | 2 +- agbenchmark/challenges/memory/m1/data.json | 6 +- agbenchmark/challenges/memory/m2/data.json | 4 +- agbenchmark/challenges/memory/m3/data.json | 4 +- agbenchmark/challenges/memory/m4/data.json | 4 +- agbenchmark/challenges/retrieval/r1/data.json | 2 +- agbenchmark/challenges/retrieval/r2/data.json | 2 +- agbenchmark/challenges/retrieval/r3/data.json | 2 +- agbenchmark/challenges/test_all.py | 15 +- agbenchmark/conftest.py | 83 ++++++++- agbenchmark/internal_info.json | 67 ++++++++ agbenchmark/regression_tests.json | 75 +++----- agbenchmark/reports/1.json | 191 +++++++++++++-------- agbenchmark/utils.py | 55 ++++++ agent/SuperAGI | 2 +- agent/gpt-engineer | 2 +- agent/mini-agi | 2 +- agent/smol-developer | 2 +- 26 files changed, 412 insertions(+), 200 deletions(-) create mode 100644 agbenchmark/internal_info.json diff --git a/agbenchmark/ReportManager.py b/agbenchmark/ReportManager.py index e6d8f62f6..cae13595a 100644 --- a/agbenchmark/ReportManager.py +++ b/agbenchmark/ReportManager.py @@ -3,7 +3,9 @@ import os import sys import time from datetime import datetime -from typing import Any, Dict, Union +from typing import Any, Dict + +from agbenchmark.utils import get_highest_success_difficulty class ReportManager: @@ -23,7 +25,6 @@ class ReportManager: if file_content: # if file is not empty, load the json data = json.loads(file_content) self.tests = {k: data[k] for k in sorted(data)} - data = self.replace_backslash(data) else: # if file is empty, assign an empty dictionary self.tests = {} except FileNotFoundError: @@ -36,8 +37,9 @@ class ReportManager: with open(self.filename, "w") as f: json.dump(self.tests, f, indent=4) - def add_test(self, test_name: str, test_details: dict) -> None: + def add_test(self, test_name: str, test_details: dict | list) -> None: self.tests[test_name] = test_details + self.save() def remove_test(self, test_name: str) -> None: @@ -50,19 +52,12 @@ class ReportManager: self.tests = { "command": command.split(os.sep)[-1], "completion_time": datetime.now().strftime("%Y-%m-%d-%H:%M"), - "time_elapsed": str(round(time.time() - self.start_time, 2)) + " seconds", + "metrics": { + "run_time": str(round(time.time() - self.start_time, 2)) + " seconds", + "highest_difficulty": get_highest_success_difficulty(self.tests), + }, "tests": self.tests, "config": config, } self.save() - - def replace_backslash(self, value: str) -> Union[str, list[str], dict]: - if isinstance(value, str): - return value.replace("\\\\", "/") # escape \ with \\ - elif isinstance(value, list): - return [self.replace_backslash(i) for i in value] - elif isinstance(value, dict): - return {k: self.replace_backslash(v) for k, v in 
value.items()} - else: - return value diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index a1a79ada0..991a7e8e0 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -23,26 +23,10 @@ def run_agent( """Calling to get a response""" if MOCK_FLAG: - print("ITS A MOCK TEST", challenge_location) copy_artifacts_into_workspace( config["workspace"], "artifacts_out", challenge_location ) else: - timeout = config["cutoff"] - print( - f"Running Python function '{config['entry_path']}' with timeout {timeout}" - ) - command = [sys.executable, "-m", config["entry_path"], str(task)] - process = subprocess.Popen( - command, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - universal_newlines=True, - cwd=os.getcwd(), - ) - - start_time = time.time() - print( f"Running Python function '{config['entry_path']}' with timeout {config['cutoff']}" ) diff --git a/agbenchmark/challenges/code/d1/data.json b/agbenchmark/challenges/code/d1/data.json index 0c7246000..061c924f5 100644 --- a/agbenchmark/challenges/code/d1/data.json +++ b/agbenchmark/challenges/code/d1/data.json @@ -13,6 +13,6 @@ "info": { "difficulty": "basic", "description": "Tests ability for the agent to debug python code with a simple typo in it.", - "side_effects": ["tests if there is in fact an LLM attached"] + "side_effects": [] } } diff --git a/agbenchmark/challenges/code/d2/data.json b/agbenchmark/challenges/code/d2/data.json index 292301094..6523ef1d8 100644 --- a/agbenchmark/challenges/code/d2/data.json +++ b/agbenchmark/challenges/code/d2/data.json @@ -11,8 +11,8 @@ "type": "execute_python_code" }, "info": { - "difficulty": "medium", + "difficulty": "novice", "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", - "side_effects": ["tests if there is in fact an LLM attached"] + "side_effects": [] } } diff --git a/agbenchmark/challenges/code/d3/data.json b/agbenchmark/challenges/code/d3/data.json index 07d607f5f..94c81664c 100644 --- a/agbenchmark/challenges/code/d3/data.json +++ b/agbenchmark/challenges/code/d3/data.json @@ -11,7 +11,7 @@ "type": "custom_python" }, "info": { - "difficulty": "medium", + "difficulty": "advanced", "description": "Tests ability for the agent to build a simple web server locally", "side_effects": [] } diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index f4e3f2220..668025dd2 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -1,15 +1,52 @@ import json +from enum import Enum from pathlib import Path from typing import List, Optional -from pydantic import BaseModel +from pydantic import BaseModel, validator + + +class DifficultyLevel(Enum): + interface = "interface" + basic = "basic" + novice = "novice" + intermediate = "intermediate" + advanced = "advanced" + expert = "expert" + human = "human" + + +# map from enum to difficulty level (numeric) +DIFFICULTY_MAP = { + DifficultyLevel.interface: 1, + DifficultyLevel.basic: 2, + DifficultyLevel.novice: 3, + DifficultyLevel.intermediate: 4, + DifficultyLevel.advanced: 5, + DifficultyLevel.expert: 6, + DifficultyLevel.human: 7, +} class Info(BaseModel): - difficulty: str + difficulty: DifficultyLevel description: str side_effects: List[str] + @validator("difficulty", pre=True) + def difficulty_to_enum(cls: "Info", v: str | DifficultyLevel) -> DifficultyLevel: + """Convert a string to an instance of 
DifficultyLevel.""" + if isinstance(v, DifficultyLevel): + return v + + if isinstance(v, str): + try: + return DifficultyLevel(v.lower()) + except ValueError: + pass + + raise ValueError(f"Cannot convert {v} to DifficultyLevel.") + class Ground(BaseModel): answer: str diff --git a/agbenchmark/challenges/interface/read_file/data.json b/agbenchmark/challenges/interface/read_file/data.json index c827581b6..1bf340454 100644 --- a/agbenchmark/challenges/interface/read_file/data.json +++ b/agbenchmark/challenges/interface/read_file/data.json @@ -11,7 +11,7 @@ }, "info": { "description": "This reads the file quickly", - "difficulty": "basic", + "difficulty": "interface", "side_effects": [""] } } diff --git a/agbenchmark/challenges/interface/search/data.json b/agbenchmark/challenges/interface/search/data.json index f59b2dc9b..de8934d95 100644 --- a/agbenchmark/challenges/interface/search/data.json +++ b/agbenchmark/challenges/interface/search/data.json @@ -11,7 +11,7 @@ "type": "file" }, "info": { - "difficulty": "basic", + "difficulty": "interface", "description": "Tests if an llm can search", "side_effects": [""] } diff --git a/agbenchmark/challenges/interface/write_file/data.json b/agbenchmark/challenges/interface/write_file/data.json index 2be2d0dfe..8db9cd620 100644 --- a/agbenchmark/challenges/interface/write_file/data.json +++ b/agbenchmark/challenges/interface/write_file/data.json @@ -11,7 +11,7 @@ "type": "file" }, "info": { - "difficulty": "basic", + "difficulty": "interface", "description": "Tests the writing to file", "side_effects": ["tests if there is in fact an LLM attached"] } diff --git a/agbenchmark/challenges/memory/m1/data.json b/agbenchmark/challenges/memory/m1/data.json index 506b246ad..ab86f1c3c 100644 --- a/agbenchmark/challenges/memory/m1/data.json +++ b/agbenchmark/challenges/memory/m1/data.json @@ -10,13 +10,9 @@ "files": [".txt"], "type": "file" }, - "mock": { - "mock_func": "basic_memory_mock", - "mock_task": "Follow the instructions in the instructions_1.txt file" - }, "info": { "difficulty": "basic", "description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files", - "side_effects": ["tests if there is in fact an LLM attached"] + "side_effects": [] } } diff --git a/agbenchmark/challenges/memory/m2/data.json b/agbenchmark/challenges/memory/m2/data.json index 7ef2552d1..9205c99f1 100644 --- a/agbenchmark/challenges/memory/m2/data.json +++ b/agbenchmark/challenges/memory/m2/data.json @@ -11,8 +11,8 @@ "type": "file" }, "info": { - "difficulty": "basic", + "difficulty": "novice", "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", - "side_effects": ["tests if there is in fact an LLM attached"] + "side_effects": [] } } diff --git a/agbenchmark/challenges/memory/m3/data.json b/agbenchmark/challenges/memory/m3/data.json index 720cce93c..3b78d614b 100644 --- a/agbenchmark/challenges/memory/m3/data.json +++ b/agbenchmark/challenges/memory/m3/data.json @@ -11,8 +11,8 @@ "type": "file" }, "info": { - "difficulty": "medium", + "difficulty": "intermediate", "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", - "side_effects": ["tests if there is in fact an LLM attached"] + "side_effects": [] } } diff --git a/agbenchmark/challenges/memory/m4/data.json b/agbenchmark/challenges/memory/m4/data.json index 61965206b..84f5c2b21 100644 --- a/agbenchmark/challenges/memory/m4/data.json +++ b/agbenchmark/challenges/memory/m4/data.json @@ -16,8 +16,8 @@ "type": "file" }, "info": { - "difficulty": "medium", + "difficulty": "advanced", "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", - "side_effects": ["tests if there is in fact an LLM attached"] + "side_effects": [] } } diff --git a/agbenchmark/challenges/retrieval/r1/data.json b/agbenchmark/challenges/retrieval/r1/data.json index 4f3833dfc..e3e09302d 100644 --- a/agbenchmark/challenges/retrieval/r1/data.json +++ b/agbenchmark/challenges/retrieval/r1/data.json @@ -13,6 +13,6 @@ "info": { "difficulty": "basic", "description": "Tests ability to retrieve information from a website.", - "side_effects": ["tests if there is in fact an LLM attached"] + "side_effects": [] } } diff --git a/agbenchmark/challenges/retrieval/r2/data.json b/agbenchmark/challenges/retrieval/r2/data.json index 5bc2e96b4..977be4bcd 100644 --- a/agbenchmark/challenges/retrieval/r2/data.json +++ b/agbenchmark/challenges/retrieval/r2/data.json @@ -11,7 +11,7 @@ "type": "file" }, "info": { - "difficulty": "basic", + "difficulty": "novice", "description": "Tests ability to retrieve information.", "side_effects": ["tests if there is in fact an LLM attached"] } diff --git a/agbenchmark/challenges/retrieval/r3/data.json b/agbenchmark/challenges/retrieval/r3/data.json index b918d3d4e..5504908ea 100644 --- a/agbenchmark/challenges/retrieval/r3/data.json +++ b/agbenchmark/challenges/retrieval/r3/data.json @@ -27,7 +27,7 @@ "type": "file" }, "info": { - "difficulty": "basic", + "difficulty": "intermediate", "description": "Tests ability to retrieve information.", "side_effects": ["tests if there is in fact an LLM attached"] } diff --git a/agbenchmark/challenges/test_all.py b/agbenchmark/challenges/test_all.py index 00a6ed635..a5afef96c 100644 --- a/agbenchmark/challenges/test_all.py +++ b/agbenchmark/challenges/test_all.py @@ -9,15 +9,10 @@ from pathlib import Path from typing import Any, Dict import pytest -from dotenv import load_dotenv from agbenchmark.challenge import Challenge from agbenchmark.start_benchmark import CURRENT_DIRECTORY - -load_dotenv() - -IMPROVE = os.getenv("IMPROVE", "False") - +from agbenchmark.utils import replace_backslash json_files = glob.glob(f"{CURRENT_DIRECTORY}/**/data.json", recursive=True) @@ -36,7 +31,11 @@ def get_test_path(json_file: str) -> str: # Create the path from "agbenchmark" onwards challenge_location = Path(*path.parts[agbenchmark_index:]) - return str(challenge_location) + formatted_location = replace_backslash(str(challenge_location)) + if isinstance(formatted_location, str): + return formatted_location + else: + return str(challenge_location) def generate_tests() -> None: @@ -68,7 +67,7 @@ def generate_tests() -> None: ) sys.path.append(str(custom_python_location)) - for (module_loader, name, ispkg) in pkgutil.iter_modules( + for module_loader, name, ispkg in pkgutil.iter_modules( [str(custom_python_location)] 
): module = importlib.import_module(name) diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 87fdc9c10..b91b5f9f8 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -1,6 +1,8 @@ import json import os import shutil +import sys +import time from pathlib import Path # noqa from typing import Any, Dict, Generator @@ -13,6 +15,7 @@ from agbenchmark.start_benchmark import ( REGRESSION_TESTS_PATH, get_regression_data, ) +from agbenchmark.utils import calculate_success_percentage def resolve_workspace(config: Dict[str, Any]) -> str: @@ -107,9 +110,29 @@ def challenge_data(request: Any) -> None: return request.param +@pytest.fixture(autouse=True, scope="session") +def mock(request: Any) -> None: + return request.config.getoption("--mock") + + +@pytest.fixture(autouse=True, scope="function") +def timer(request: Any) -> Any: + start_time = time.time() + yield + run_time = time.time() - start_time + request.node.user_properties.append(("run_time", run_time)) + + +# tests that consistently pass are considered regression tests regression_manager = ReportManager(REGRESSION_TESTS_PATH) + +# user facing reporting information info_manager = ReportManager(INFO_TESTS_PATH) +INTERNAL_LOGS = Path(__file__).resolve().parent # agbenchmark/conftest.py +# internal db step in replacement track pass/fail rate +internal_info = ReportManager(str(INTERNAL_LOGS / "internal_info.json")) + def pytest_runtest_makereport(item: Any, call: Any) -> None: if call.when == "call": @@ -122,23 +145,66 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None: ) # Extract the challenge_location from the class challenge_location: str = getattr(item.cls, "CHALLENGE_LOCATION", "") + test_name = item.nodeid.split("::")[1] + item.test_name = test_name test_details = { "difficulty": difficulty, "dependencies": dependencies, - "test": challenge_location, + "data_path": challenge_location, + } + + info_details: Any = { + "data_path": challenge_location, + "is_regression": False, + "metrics": { + "difficulty": difficulty, + "success": False, + }, } - print("pytest_runtest_makereport", test_details) + mock = "--mock" in sys.argv # Check if --mock is in sys.argv + if call.excinfo is None: - regression_manager.add_test(item.nodeid.split("::")[1], test_details) - test_details["success"] = True + info_details["metrics"]["success"] = True else: - regression_manager.remove_test(item.nodeid.split("::")[1]) - test_details["success"] = False - test_details["fail_reason"] = str(call.excinfo.value) + if not mock: # don't remove if it's a mock test + regression_manager.remove_test(test_name) + info_details["metrics"]["fail_reason"] = str(call.excinfo.value) + + prev_test_results: list[bool] = [] + + if not mock: + # only add if it's an actual test + prev_test_results = internal_info.tests.get(test_name, []) + prev_test_results.append(info_details["metrics"]["success"]) + internal_info.add_test(test_name, prev_test_results) + + # can calculate success rate regardless of mock + info_details["metrics"]["success_%"] = calculate_success_percentage( + prev_test_results + ) + + if len(prev_test_results) >= 3 and prev_test_results[-3:] == [True, True, True]: + # if the last 3 tests were successful, add to the regression tests + info_details["is_regression"] = True + regression_manager.add_test(test_name, test_details) + + # user facing reporting + item.info_details = info_details + if call.when == "teardown": + run_time = dict(item.user_properties).get("run_time") + + info_details = getattr(item, "info_details", {}) + 
test_name = getattr(item, "test_name", "") + + if info_details and test_name: + if run_time: + info_details["metrics"][ + "run_time" + ] = f"{str(round(run_time, 3))} seconds" - info_manager.add_test(item.nodeid.split("::")[1], test_details) + info_manager.add_test(test_name, info_details) def pytest_sessionfinish(session: Any) -> None: @@ -146,6 +212,7 @@ def pytest_sessionfinish(session: Any) -> None: with open(CONFIG_PATH, "r") as f: config = json.load(f) + internal_info.save() info_manager.end_info_report(config) regression_manager.save() diff --git a/agbenchmark/internal_info.json b/agbenchmark/internal_info.json new file mode 100644 index 000000000..5f46bd854 --- /dev/null +++ b/agbenchmark/internal_info.json @@ -0,0 +1,67 @@ +{ + "TestBasicMemory": [ + true, + true, + true + ], + "TestBasicRetrieval": [ + true, + true, + true + ], + "TestCreateSimpleWebServer": [ + false, + false, + false + ], + "TestDebugSimpleTypoWithGuidance": [ + false, + false, + false + ], + "TestDebugSimpleTypoWithoutGuidance": [ + false, + false, + false + ], + "TestReadFile": [ + true, + true, + true + ], + "TestRememberMultipleIds": [ + true, + true, + true + ], + "TestRememberMultipleIdsWithNoise": [ + true, + true, + true + ], + "TestRememberMultiplePhrasesWithNoise": [ + true, + true, + true + ], + "TestRetrieval2": [ + true, + true, + true + ], + "TestRetrieval3": [ + true, + true, + true + ], + "TestSearch": [ + true, + true, + true + ], + "TestWriteFile": [ + true, + true, + true + ] +} \ No newline at end of file diff --git a/agbenchmark/regression_tests.json b/agbenchmark/regression_tests.json index 68632a127..ce73ce263 100644 --- a/agbenchmark/regression_tests.json +++ b/agbenchmark/regression_tests.json @@ -1,20 +1,11 @@ { - "TestReadFile": { - "difficulty": "basic", - "dependencies": [ - "TestWriteFile" - ], - "test": "agbenchmark/challenges/interface/read_file", - "success": true - }, "TestBasicMemory": { "difficulty": "basic", "dependencies": [ "TestReadFile", "TestWriteFile" ], - "test": "agbenchmark/challenges/memory/m1", - "success": true + "data_path": "agbenchmark/challenges/memory/m1" }, "TestBasicRetrieval": { "difficulty": "basic", @@ -22,78 +13,60 @@ "TestWriteFile", "TestSearch" ], - "test": "agbenchmark/challenges/retrieval/r1", - "success": true + "data_path": "agbenchmark/challenges/retrieval/r1" }, - "TestRememberMultipleIds": { + "TestReadFile": { "difficulty": "basic", "dependencies": [ - "TestBasicMemory" + "TestWriteFile" ], - "test": "agbenchmark/challenges/memory/m2", - "success": true + "data_path": "agbenchmark/challenges/interface/read_file" }, - "TestRetrieval2": { + "TestRememberMultipleIds": { "difficulty": "basic", "dependencies": [ - "TestBasicRetrieval" + "TestBasicMemory" ], - "test": "agbenchmark/challenges/retrieval/r2", - "success": true + "data_path": "agbenchmark/challenges/memory/m2" }, "TestRememberMultipleIdsWithNoise": { "difficulty": "medium", "dependencies": [ "TestRememberMultipleIds" ], - "test": "agbenchmark/challenges/memory/m3", - "success": true - }, - "TestRetrieval3": { - "difficulty": "basic", - "dependencies": [ - "TestRetrieval2" - ], - "test": "agbenchmark/challenges/retrieval/r3", - "success": true + "data_path": "agbenchmark/challenges/memory/m3" }, "TestRememberMultiplePhrasesWithNoise": { "difficulty": "medium", "dependencies": [ "TestRememberMultipleIdsWithNoise" ], - "test": "agbenchmark/challenges/memory/m4", - "success": true + "data_path": "agbenchmark/challenges/memory/m4" }, - "TestSearch": { + "TestRetrieval2": { "difficulty": 
"basic", "dependencies": [ - "TestWriteFile" + "TestBasicRetrieval" ], - "test": "agbenchmark/challenges/interface/search", - "success": true + "data_path": "agbenchmark/challenges/retrieval/r2" }, - "TestWriteFile": { + "TestRetrieval3": { "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/interface/write_file", - "success": true + "dependencies": [ + "TestRetrieval2" + ], + "data_path": "agbenchmark/challenges/retrieval/r3" }, - "TestDebugSimpleTypoWithGuidance": { + "TestSearch": { "difficulty": "basic", "dependencies": [ - "TestReadFile", "TestWriteFile" ], - "test": "agbenchmark/challenges/code/d1", - "success": true + "data_path": "agbenchmark/challenges/interface/search" }, - "TestDebugSimpleTypoWithoutGuidance": { - "difficulty": "medium", - "dependencies": [ - "TestDebugSimpleTypoWithGuidance" - ], - "test": "agbenchmark/challenges/code/d2", - "success": true + "TestWriteFile": { + "difficulty": "basic", + "dependencies": [], + "data_path": "agbenchmark/challenges/interface/write_file" } } \ No newline at end of file diff --git a/agbenchmark/reports/1.json b/agbenchmark/reports/1.json index df07fb878..45945a3ee 100644 --- a/agbenchmark/reports/1.json +++ b/agbenchmark/reports/1.json @@ -1,109 +1,148 @@ { "command": "agbenchmark start --mock", - "completion_time": "2023-07-10-21:19", - "time_elapsed": "8.75 seconds", + "completion_time": "2023-07-11-21:09", + "metrics": { + "run_time": "0.96 seconds", + "highest_difficulty": "advanced: 5" + }, "tests": { "TestWriteFile": { - "difficulty": "basic", - "dependencies": [], - "test": "agbenchmark/challenges/interface/write_file", - "success": true + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 0, + "run_time": "0.008 seconds" + } }, "TestReadFile": { - "difficulty": "basic", - "dependencies": [ - "TestWriteFile" - ], - "test": "agbenchmark/challenges/interface/read_file", - "success": true + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 0, + "run_time": "0.005 seconds" + } }, "TestSearch": { - "difficulty": "basic", - "dependencies": [ - "TestWriteFile" - ], - "test": "agbenchmark/challenges/interface/search", - "success": true + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 0, + "run_time": "0.006 seconds" + } }, "TestDebugSimpleTypoWithGuidance": { - "difficulty": "basic", - "dependencies": [ - "TestReadFile", - "TestWriteFile" - ], - "test": "agbenchmark/challenges/code/d1", - "success": true + "data_path": "agbenchmark/challenges/code/d1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0, + "run_time": "0.489 seconds" + } }, "TestBasicMemory": { - "difficulty": "basic", - "dependencies": [ - "TestReadFile", - "TestWriteFile" - ], - "test": "agbenchmark/challenges/memory/m1", - "success": true + "data_path": "agbenchmark/challenges/memory/m1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": true, + "success_%": 0, + "run_time": "0.02 seconds" + } }, "TestBasicRetrieval": { - "difficulty": "basic", - "dependencies": [ - "TestWriteFile", - "TestSearch" - ], - "test": "agbenchmark/challenges/retrieval/r1", - "success": true + "data_path": 
"agbenchmark/challenges/retrieval/r1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": true, + "success_%": 0, + "run_time": "0.01 seconds" + } }, "TestDebugSimpleTypoWithoutGuidance": { - "difficulty": "medium", - "dependencies": [ - "TestDebugSimpleTypoWithGuidance" - ], - "test": "agbenchmark/challenges/code/d2", - "success": true + "data_path": "agbenchmark/challenges/code/d2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0, + "run_time": "0.001 seconds" + } + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d3", + "is_regression": false, + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0, + "run_time": "0.001 seconds" + } }, "TestRememberMultipleIds": { - "difficulty": "basic", - "dependencies": [ - "TestBasicMemory" - ], - "test": "agbenchmark/challenges/memory/m2", - "success": true + "data_path": "agbenchmark/challenges/memory/m2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": true, + "success_%": 0, + "run_time": "0.018 seconds" + } }, "TestRetrieval2": { - "difficulty": "basic", - "dependencies": [ - "TestBasicRetrieval" - ], - "test": "agbenchmark/challenges/retrieval/r2", - "success": true + "data_path": "agbenchmark/challenges/retrieval/r2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": true, + "success_%": 0, + "run_time": "0.009 seconds" + } }, "TestRememberMultipleIdsWithNoise": { - "difficulty": "medium", - "dependencies": [ - "TestRememberMultipleIds" - ], - "test": "agbenchmark/challenges/memory/m3", - "success": true + "data_path": "agbenchmark/challenges/memory/m3", + "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": true, + "success_%": 0, + "run_time": "0.022 seconds" + } }, "TestRetrieval3": { - "difficulty": "basic", - "dependencies": [ - "TestRetrieval2" - ], - "test": "agbenchmark/challenges/retrieval/r3", - "success": true + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": true, + "success_%": 0, + "run_time": "0.01 seconds" + } }, "TestRememberMultiplePhrasesWithNoise": { - "difficulty": "medium", - "dependencies": [ - "TestRememberMultipleIdsWithNoise" - ], - "test": "agbenchmark/challenges/memory/m4", - "success": true + "data_path": "agbenchmark/challenges/memory/m4", + "is_regression": false, + "metrics": { + "difficulty": "advanced", + "success": true, + "success_%": 0, + "run_time": "0.021 seconds" + } } }, "config": { "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "agbenchmark/benchmarks.py", + "entry_path": "agbenchmark.benchmarks", "cutoff": 60 } } \ No newline at end of file diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py index ffde0c6d3..598113d3d 100644 --- a/agbenchmark/utils.py +++ b/agbenchmark/utils.py @@ -1,6 +1,10 @@ # radio charts, logs, helper functions for tests, anything else relevant. 
import glob +import re from pathlib import Path +from typing import Any + +from agbenchmark.challenges.define_task_types import DIFFICULTY_MAP, DifficultyLevel def calculate_info_test_path(benchmarks_folder_path: Path) -> str: @@ -15,3 +19,54 @@ def calculate_info_test_path(benchmarks_folder_path: Path) -> str: run_name = f"{file_count + 1}.json" new_file_path = INFO_TESTS_PATH / run_name return str(new_file_path) + + +def replace_backslash(value: Any) -> Any: + if isinstance(value, str): + return re.sub( + r"\\+", "/", value + ) # replace one or more backslashes with a forward slash + elif isinstance(value, list): + return [replace_backslash(i) for i in value] + elif isinstance(value, dict): + return {k: replace_backslash(v) for k, v in value.items()} + else: + return value + + +def calculate_success_percentage(results: list[bool]) -> float: + success_count = results.count(True) + total_count = len(results) + if total_count == 0: + return 0 + success_percentage = (success_count / total_count) * 100 # as a percentage + return round(success_percentage, 2) + + +def get_highest_success_difficulty(data: dict) -> str: + highest_difficulty = None + highest_difficulty_level = -1 + + for test_name, test_data in data.items(): + if test_data["metrics"]["success"]: + # Replace 'medium' with 'intermediate' for this example + difficulty_str = test_data["metrics"]["difficulty"] + + try: + difficulty_enum = DifficultyLevel[difficulty_str.lower()] + difficulty_level = DIFFICULTY_MAP[difficulty_enum] + + if difficulty_level > highest_difficulty_level: + highest_difficulty = difficulty_enum + highest_difficulty_level = difficulty_level + except KeyError: + print( + f"Unexpected difficulty level '{difficulty_str}' in test '{test_name}'" + ) + + if highest_difficulty is not None: + highest_difficulty_str = highest_difficulty.name # convert enum to string + else: + highest_difficulty_str = "" + + return f"{highest_difficulty_str}: {highest_difficulty_level}" diff --git a/agent/SuperAGI b/agent/SuperAGI index 928051291..bd4b3def6 160000 --- a/agent/SuperAGI +++ b/agent/SuperAGI @@ -1 +1 @@ -Subproject commit 9280512910c74bc33333e2ce7c48e47021227529 +Subproject commit bd4b3def65e964182b05bb9f7a350b00f55a6007 diff --git a/agent/gpt-engineer b/agent/gpt-engineer index 42400fd67..cde9be3e7 160000 --- a/agent/gpt-engineer +++ b/agent/gpt-engineer @@ -1 +1 @@ -Subproject commit 42400fd67972278e454621e7abf450a4f899a44a +Subproject commit cde9be3e73212b3d8366a4ed149a18122bfe2333 diff --git a/agent/mini-agi b/agent/mini-agi index 6a1d08880..08764876d 160000 --- a/agent/mini-agi +++ b/agent/mini-agi @@ -1 +1 @@ -Subproject commit 6a1d08880c65fe3e5831243c1e1ea19acf85516c +Subproject commit 08764876d9a5c84c9f9e879088854d2b9349d7a0 diff --git a/agent/smol-developer b/agent/smol-developer index a0e9f4f39..c52b14b1d 160000 --- a/agent/smol-developer +++ b/agent/smol-developer @@ -1 +1 @@ -Subproject commit a0e9f4f39e26a56b13a364be09fc58d2d85150ea +Subproject commit c52b14b1d5b1b74d886f08d9914e7f43437f609d -- cgit v1.2.3 From b00570f6d9e5ddce1812c0014e7593ea15033736 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Wed, 12 Jul 2023 15:04:24 +0200 Subject: Fix CI warnings --- .github/workflows/benchmarks.yml | 4 ++-- .github/workflows/ci.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index e40abf2f6..195ebeffd 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -27,8 +27,8 @@ jobs: with: 
ref: master - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + - name: Set up Python ${{ matrix.config.python-version }} + uses: actions/setup-python@v4 with: python-version: ${{ matrix.config.python-version }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 79ed16e87..a3d982137 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -33,7 +33,7 @@ jobs: repository: ${{ github.event.pull_request.head.repo.full_name }} - name: Set up Python ${{ env.min-python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ env.min-python-version }} @@ -132,7 +132,7 @@ jobs: fi - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} -- cgit v1.2.3 From 21c0cdcb76f4da12ef413bfc59d171f4f49260d8 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Wed, 12 Jul 2023 17:50:18 +0200 Subject: Disable proxy for internal pull requests (#4953) --- .github/workflows/ci.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a3d982137..109d2d5c1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -159,9 +159,10 @@ jobs: python tests/challenges/utils/build_current_score.py env: CI: true - PROXY: ${{ secrets.PROXY }} - AGENT_MODE: ${{ secrets.AGENT_MODE }} - AGENT_TYPE: ${{ secrets.AGENT_TYPE }} + PROXY: ${{ github.event_name == 'pull_request_target' && secrets.PROXY || '' }} + AGENT_MODE: ${{ github.event_name == 'pull_request_target' && secrets.AGENT_MODE || '' }} + AGENT_TYPE: ${{ github.event_name == 'pull_request_target' && secrets.AGENT_TYPE || '' }} + OPENAI_API_KEY: ${{ github.event_name == 'pull_request' && secrets.OPENAI_API_KEY || '' }} PLAIN_OUTPUT: True - name: Upload coverage reports to Codecov -- cgit v1.2.3 From 3582ada3df619a454f2c12c095f24d088c9b3441 Mon Sep 17 00:00:00 2001 From: James Collins Date: Wed, 12 Jul 2023 10:21:20 -0700 Subject: Add links to github issues in the README and clarify run instructions (#4954) --- autogpt/core/README.md | 54 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 14 deletions(-) diff --git a/autogpt/core/README.md b/autogpt/core/README.md index f7bdf2d7d..c27fa28fe 100644 --- a/autogpt/core/README.md +++ b/autogpt/core/README.md @@ -1,7 +1,27 @@ -# Run instructions +# Auto-GPT Core + +This subpackage contains the ongoing work for the +[Auto-GPT Re-arch](https://github.com/Significant-Gravitas/Auto-GPT/issues/4770). It is +a work in progress and is not yet feature complete. In particular, it does not yet +have many of the Auto-GPT commands implemented and is pending ongoing work to +[re-incorporate vector-based memory and knowledge retrieval](https://github.com/Significant-Gravitas/Auto-GPT/issues/3536). + + +## Running the Re-arch Code There are two client applications for Auto-GPT included. +Unlike the main version of Auto-GPT, the re-arch requires you to actually install Auto-GPT in your python +environment to run this application. To do so, run + +``` +pip install -e REPOSITORY_ROOT +``` + +where `REPOSITORY_ROOT` is the root of the Auto-GPT repository on your machine. The `REPOSITORY_ROOT` +is the directory that contains the `setup.py` file and is the main, top-level directory of the repository +when you clone it. 
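For example, if the repository was cloned to `~/Auto-GPT` (an illustrative path, not a requirement), the install step becomes:

```
git clone https://github.com/Significant-Gravitas/Auto-GPT.git ~/Auto-GPT
pip install -e ~/Auto-GPT
```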
+
## CLI Application

:star2: **This is the reference application I'm working with for now** :star2:

@@ -11,21 +31,23 @@ The first app is a straight CLI application. I have not done anything yet to po
- [Entry Point](https://github.com/Significant-Gravitas/Auto-GPT/blob/master/autogpt/core/runner/cli_app/cli.py)
- [Client Application](https://github.com/Significant-Gravitas/Auto-GPT/blob/master/autogpt/core/runner/cli_app/main.py)

-Auto-GPT must be installed in your python environment to run this application. To do so, run
-
-```
-pip install -e REPOSITORY_ROOT
-```
-
-where `REPOSITORY_ROOT` is the root of the Auto-GPT repository on your machine.
-
You'll then need a settings file. Run

```
python REPOSITORY_ROOT/autogpt/core/runner/cli_app/cli.py make-settings
```

-This will write a file called `default_agent_settings.yaml` with all the user-modifiable configuration keys to `~/auto-gpt/default_agent_settings.yml` and make the `auto-gpt` directory in your user directory if it doesn't exist). At a bare minimum, you'll need to set `openai.credentials.api_key` to your OpenAI API Key to run the model.
+This will write a file called `default_agent_settings.yaml` with all the user-modifiable
+configuration keys to `~/auto-gpt/default_agent_settings.yml` (and make the `auto-gpt` directory
+in your user directory if it doesn't exist). Your user directory is located in different places
+depending on your operating system:
+
+- On Linux, it's `/home/USERNAME`
+- On Windows, it's `C:\Users\USERNAME`
+- On Mac, it's `/Users/USERNAME`
+
+At a bare minimum, you'll need to set `openai.credentials.api_key` to your OpenAI API Key to run
+the model.

You can then run Auto-GPT with

@@ -35,9 +57,15 @@ python REPOSITORY_ROOT/autogpt/core/runner/cli_app/cli.py run

to launch the interaction loop.

-## CLI Web App
+### CLI Web App
+
+:warning: I am not actively developing this application. I am primarily working with the traditional CLI app
+described above. It is a very good place to get involved if you have web application design experience and are
+looking to get involved in the re-arch.
\ No newline at end of file -- cgit v1.2.3 From e0b16cf4ac9a6edb83cdc67ed7d1d8161f3a8956 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Wed, 12 Jul 2023 10:54:50 -0700 Subject: Fix Smol developer and gpt engineer (#93) Signed-off-by: Merwane Hamadi --- agent/gpt-engineer | 2 +- agent/smol-developer | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/agent/gpt-engineer b/agent/gpt-engineer index cde9be3e7..521d626c0 160000 --- a/agent/gpt-engineer +++ b/agent/gpt-engineer @@ -1 +1 @@ -Subproject commit cde9be3e73212b3d8366a4ed149a18122bfe2333 +Subproject commit 521d626c0075ed6545f01b771757c856f8addbd6 diff --git a/agent/smol-developer b/agent/smol-developer index c52b14b1d..aa8233925 160000 --- a/agent/smol-developer +++ b/agent/smol-developer @@ -1 +1 @@ -Subproject commit c52b14b1d5b1b74d886f08d9914e7f43437f609d +Subproject commit aa8233925090c0c9314ceef68397ab37baf17766 -- cgit v1.2.3 From 48ac1c91cd85960d62928d9ab9bb66a8172e8f84 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Wed, 12 Jul 2023 14:30:06 -0700 Subject: Remove dependencies cache (#94) Signed-off-by: Merwane Hamadi --- .github/workflows/ci.yml | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 45bd64fff..34eedb292 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -38,14 +38,6 @@ jobs: run: | curl -sSL https://install.python-poetry.org | python - - - name: Set up Poetry cache - uses: actions/cache@v2 - with: - path: | - ~/.cache/pypoetry - .venv - key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} - - name: Install dependencies run: | poetry install @@ -107,14 +99,6 @@ jobs: run: | curl -sSL https://install.python-poetry.org | python - - - name: Set up Poetry cache - uses: actions/cache@v2 - with: - path: | - ~/.cache/pypoetry - .venv - key: ${{ runner.os }}-poetry-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/poetry.lock') }}-${{ steps.get_date.outputs.date }} - - name: Install dependencies run: | poetry install -- cgit v1.2.3 From 78df4915cf41e6fed0a8dc783102728e72825253 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Wed, 12 Jul 2023 14:35:12 -0700 Subject: Remove dependencies if a specific test is asked by the user (#95) Signed-off-by: Merwane Hamadi --- agbenchmark/conftest.py | 3 +++ agbenchmark/start_benchmark.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index b91b5f9f8..32151b8ad 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -85,6 +85,7 @@ def pytest_addoption(parser: Any) -> None: parser.addoption("--mock", action="store_true", default=False) parser.addoption("--improve", action="store_true", default=False) parser.addoption("--maintain", action="store_true", default=False) + parser.addoption("--test", action="store_true", default=None) @pytest.fixture(autouse=True) @@ -232,6 +233,8 @@ def pytest_collection_modifyitems(items: Any, config: Any) -> None: # Filter dependencies if they exist in regression data if its an improvement test if config.getoption("--improve"): dependencies = [dep for dep in dependencies if not data.get(dep, None)] + elif config.getoption("--test"): + dependencies = [] categories = test_class_instance.data.category diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index 917cd4e8a..ab2586e60 100644 --- a/agbenchmark/start_benchmark.py +++ 
b/agbenchmark/start_benchmark.py
@@ -94,7 +94,7 @@ def start(category: str, test: str, maintain: bool, improve: bool, mock: bool) -
     pytest_args = ["-vs"]
     if test:
         print("Running specific test:", test)
-        pytest_args.extend(["-k", test])
+        pytest_args.extend(["-k", test, "--test"])
     else:
         if category:
             pytest_args.extend(["-m", category])
-- 
cgit v1.2.3


From 077e143cc2074dea8bb930c4224b4afc13e5b501 Mon Sep 17 00:00:00 2001
From: James Collins
Date: Wed, 12 Jul 2023 18:38:48 -0700
Subject: Documentation/collate rearch notes (#4958)

* Add links to github issues in the README and clarify run instructions

* Added a new doc in the core package with architecture notes.
---
 autogpt/core/ARCHITECTURE_NOTES.md | 272 +++++++++++++++++++++++++++++++++++++
 autogpt/core/README.md | 6 +
 2 files changed, 278 insertions(+)
 create mode 100644 autogpt/core/ARCHITECTURE_NOTES.md

diff --git a/autogpt/core/ARCHITECTURE_NOTES.md b/autogpt/core/ARCHITECTURE_NOTES.md
new file mode 100644
index 000000000..b9fd2021b
--- /dev/null
+++ b/autogpt/core/ARCHITECTURE_NOTES.md
@@ -0,0 +1,272 @@
+# Re-architecture Notes
+
+## Key Documents
+
+- [Planned Agent Workflow](https://whimsical.com/agent-workflow-v2-NmnTQ8R7sVo7M3S43XgXmZ)
+- [Original Architecture Diagram](https://www.figma.com/file/fwdj44tPR7ArYtnGGUKknw/Modular-Architecture?type=whiteboard&node-id=0-1) - This is sadly well out of date at this point.
+- [Kanban](https://github.com/orgs/Significant-Gravitas/projects/1/views/1?filterQuery=label%3Are-arch)
+
+## The Motivation
+
+The `master` branch of Auto-GPT is an organically grown amalgamation of many thoughts
+and ideas about agent-driven autonomous systems. It lacks clear abstraction boundaries,
+has issues of global state and poorly encapsulated state, and is generally just hard to
+make effective changes to. Mainly it's just a system that's hard to make changes to.
+And research in the field is moving fast, so we want to be able to try new ideas
+quickly.
+
+## Initial Planning
+
+A large group of maintainers and contributors met to discuss the architectural
+challenges associated with the existing codebase. Many much-desired features (building
+new user interfaces, enabling project-specific agents, enabling multi-agent systems)
+are bottlenecked by the global state in the system. We discussed the tradeoffs between
+an incremental system transition and a big breaking version change and decided to go
+for the breaking version change. We justified this by saying:
+
+- We can maintain, in essence, the same user experience as now even with a radical
+  restructuring of the codebase
+- Our developer audience is struggling to use the existing codebase to build
+  applications and libraries of their own, so this breaking change will largely be
+  welcome.
+
+## Primary Goals
+
+- Separate the AutoGPT application code from the library code.
+- Remove global state from the system
+- Allow for multiple agents per user (with facilities for running simultaneously)
+- Create a serializable representation of an Agent
+- Encapsulate the core systems in abstractions with clear boundaries.
+
+## Secondary goals
+
+- Use existing tools to ditch any unnecessary cruft in the codebase (document loading,
+  json parsing, anything easier to replace than to port).
+- Bring in the [core agent loop updates](https://whimsical.com/agent-workflow-v2-NmnTQ8R7sVo7M3S43XgXmZ)
+  being developed simultaneously by @Pwuts
+
+# The Agent Subsystems
+
+## Configuration
+
+We want a lot of things from a configuration system.
We lean heavily on it in the
+`master` branch to allow several parts of the system to communicate with each other.
+[Recent work](https://github.com/Significant-Gravitas/Auto-GPT/pull/4737) has made it
+so that the config is no longer a singleton object that is materialized from the import
+state, but it's still treated as a
+[god object](https://en.wikipedia.org/wiki/God_object) containing all information about
+the system and _critically_ allowing any system to reference configuration information
+about other parts of the system.
+
+### What we want
+
+- It should still be reasonable to collate the entire system configuration in a
+  sensible way.
+- The configuration should be validatable and validated.
+- The system configuration should be a _serializable_ representation of an `Agent`.
+- The configuration system should provide a clear (albeit very low-level) contract
+  about user-configurable aspects of the system.
+- The configuration should reasonably manage default values and user-provided overrides.
+- The configuration system needs to handle credentials in a reasonable way.
+- The configuration should be the representation of some amount of system state, like
+  API budgets and resource usage. These aspects are recorded in the configuration and
+  updated by the system itself.
+- Agent systems should have encapsulated views of the configuration. E.g. the memory
+  system should know about memory configuration but nothing about command configuration.
+
+## Workspace
+
+There are two ways to think about the workspace:
+
+- The workspace is a scratch space for an agent where it can store files, write code,
+  and do pretty much whatever else it likes.
+- The workspace is, at any given point in time, the single source of truth for what an
+  agent is. It contains the serializable state (the configuration) as well as all
+  other working state (stored files, databases, memories, custom code).
+
+In the existing system there is **one** workspace. And because the workspace holds so
+much agent state, that means a user can only work with one agent at a time.
+
+## Memory
+
+The memory system has been under extremely active development.
+See [#3536](https://github.com/Significant-Gravitas/Auto-GPT/issues/3536) and
+[#4208](https://github.com/Significant-Gravitas/Auto-GPT/pull/4208) for discussion and
+work in the `master` branch. The TL;DR is
+that we noticed a couple of months ago that the `Agent` performed **worse** with
+permanent memory than without it. Since then the knowledge storage and retrieval
+system has been [redesigned](https://whimsical.com/memory-system-8Ae6x6QkjDwQAUe9eVJ6w1)
+and partially implemented in the `master` branch.
+
+## Planning/Prompt-Engineering
+
+The planning system is the system that translates user desires/agent intentions into
+language model prompts. In the course of development, it has become pretty clear
+that `Planning` is the wrong name for this system.
+
+### What we want
+
+- It should be incredibly obvious what's being passed to a language model, when it's
+  being passed, and what the language model response is. The landscape of language
+  model research is developing very rapidly, so building complex abstractions between
+  users/contributors and the language model interactions is going to make it very
+  difficult for us to nimbly respond to new research developments.
+- Prompt-engineering should ideally be exposed in a parameterizable way to users.
+- We should, where possible, leverage OpenAI's new
+  [function calling api](https://openai.com/blog/function-calling-and-other-api-updates)
+  to get outputs in a standard machine-readable format and avoid the deep pit of
+  parsing json (and fixing unparsable json).
+
+### Planning Strategies
+
+The [new agent workflow](https://whimsical.com/agent-workflow-v2-NmnTQ8R7sVo7M3S43XgXmZ)
+has many, many interaction points for language models. We really would like to not
+distribute prompt templates and raw strings all through the system. The re-arch solution
+is to encapsulate language model interactions into planning strategies.
+These strategies are defined by
+
+- The `LanguageModelClassification` they use (`FAST` or `SMART`)
+- A function `build_prompt` that takes strategy specific arguments and constructs a
+  `LanguageModelPrompt` (a simple container for lists of messages and functions to
+  pass to the language model)
+- A function `parse_content` that parses the response content (a dict) into a better
+  formatted dict. Contracts here are intentionally loose and will tighten once we have
+  at least one other language model provider.
+
+## Resources
+
+Resources are kinds of services we consume from external APIs. They may have associated
+credentials and costs we need to manage. Management of those credentials is implemented
+as manipulation of the resource configuration. We have two categories of resources
+currently:
+
+- AI/ML model providers (including language model providers and embedding model providers, i.e. OpenAI)
+- Memory providers (e.g. Pinecone, Weaviate, ChromaDB, etc.)
+
+### What we want
+
+- Resource abstractions should provide a common interface to different service providers
+  for a particular kind of service.
+- Resource abstractions should manipulate the configuration to manage their credentials
+  and budget/accounting.
+- Resource abstractions should be composable over an API (e.g. I should be able to make
+  an OpenAI provider that is both a LanguageModelProvider and an EmbeddingModelProvider
+  and use it wherever I need those services).
+
+## Abilities
+
+Along with planning and memory usage, abilities are one of the major augmentations of
+augmented language models. They allow us to expand the scope of what language models
+can do by hooking them up to code they can execute to obtain new knowledge or influence
+the world.
+
+### What we want
+
+- Abilities should have an extremely clear interface that users can write to.
+- Abilities should have an extremely clear interface that a language model can
+  understand.
+- Abilities should be declarative about their dependencies so the system can inject them
+- Abilities should be executable (where sensible) in an async run loop.
+- Abilities should not have side effects unless those side effects are clear in
+  their representation to an agent (e.g. the BrowseWeb ability shouldn't write a file,
+  but the WriteFile ability can).
+
+## Plugins
+
+Users want to add lots of features that we don't want to support as first-party.
+Our solution to this is a plugin system to allow users to plug in their functionality or
+to construct their agent from a public plugin marketplace. Our primary concern in the
+re-arch is to build a stateless plugin service interface and a simple implementation
+that can load plugins from installed packages or from zip files. Future efforts will
+expand this system to allow plugins to load from a marketplace or some other kind
+of service.
+
+### What is a Plugin
+
+Plugins are a kind of garbage term.
They refer to a number of things.
+
+- New commands for the agent to execute. This is the most common usage.
+- Replacements for entire subsystems like memory or language model providers
+- Application plugins that do things like send emails or communicate via WhatsApp
+- The repositories contributors create that may themselves have multiple plugins in them.
+
+### Usage in the existing system
+
+The current plugin system is _hook-based_. This means plugins don't correspond to
+kinds of objects in the system, but rather to times in the system at which we defer
+execution to them. The main advantage of this setup is that user code can hijack
+pretty much any behavior of the agent by injecting code that supersedes the normal
+agent execution. The disadvantages to this approach are numerous:
+
+- We have absolutely no mechanisms to enforce any security measures because the threat
+  surface is everything.
+- We cannot reason about agent behavior in a cohesive way because control flow can be
+  ceded to user code at pretty much any point and arbitrarily change or break the
+  agent behavior
+- The interface for designing a plugin is kind of terrible and difficult to standardize
+- The hook-based implementation means we couple ourselves to a particular flow of
+  control (or otherwise risk breaking plugin behavior). E.g. many of the hook targets
+  in the [old workflow](https://whimsical.com/agent-workflow-VAzeKcup3SR7awpNZJKTyK)
+  are not present or mean something entirely different in the
+  [new workflow](https://whimsical.com/agent-workflow-v2-NmnTQ8R7sVo7M3S43XgXmZ).
+- Etc.
+
+### What we want
+
+- A concrete definition of a plugin that is narrow enough in scope that we can define
+  it well and reason about how it will work in the system.
+- A set of abstractions that let us define a plugin by its storage format and location
+- A service interface that knows how to parse the plugin abstractions and turn them
+  into concrete classes and objects.
+
+
+## Some Notes on how and why we'll use OO in this project
+
+First and foremost, Python itself is an object-oriented language. Its
+underlying [data model](https://docs.python.org/3/reference/datamodel.html) is built
+with object-oriented programming in mind. It offers useful tools like abstract base
+classes to communicate interfaces to developers who want to, e.g., write plugins, or
+help work on implementations. If we were working in a different language that offered
+different tools, we'd use a different paradigm.
+
+While many things are classes in the re-arch, they are not classes in the same way.
+There are three kinds of things (roughly) that are written as classes in the re-arch:
+1. **Configuration**: Auto-GPT has *a lot* of configuration. This configuration
+   is *data* and we use **[Pydantic](https://docs.pydantic.dev/latest/)** to manage it as
+   pydantic is basically industry standard for this stuff. It provides runtime validation
+   for all the configuration and allows us to easily serialize configuration to both basic
+   python types (dicts, lists, and primitives) as well as serialize to json, which is
+   important for us being able to put representations of agents
+   [on the wire](https://en.wikipedia.org/wiki/Wire_protocol) for web applications and
+   agent-to-agent communication. *These are essentially
+   [structs](https://en.wikipedia.org/wiki/Struct_(C_programming_language)) rather than
+   traditional classes.*
+2. **Internal Data**: Very similar to configuration, Auto-GPT passes around boatloads
+   of internal data.
We are interacting with language models and language model APIs + which means we are handling lots of *structured* but *raw* text. Here we also + leverage **pydantic** to both *parse* and *validate* the internal data and also to + give us concrete types which we can use static type checkers to validate against + and discover problems before they show up as bugs at runtime. *These are + essentially [structs](https://en.wikipedia.org/wiki/Struct_(C_programming_language)) + rather than traditional classes.* +3. **System Interfaces**: This is our primary traditional use of classes in the + re-arch. We have a bunch of systems. We want many of those systems to have + alternative implementations (e.g. via plugins). We use abstract base classes to + define interfaces to communicate with people who might want to provide those + plugins. We provide a single concrete implementation of most of those systems as a + subclass of the interface. This should not be controversial. + +The approach is consistent with +[prior](https://github.com/Significant-Gravitas/Auto-GPT/issues/2458) +[work](https://github.com/Significant-Gravitas/Auto-GPT/pull/2442) done by other +maintainers in this direction. + +From an organization standpoint, OO programming is by far the most popular programming +paradigm (especially for Python). It's the one most often taught in programming classes +and the one with the most available online training for people interested in +contributing. + +Finally, and importantly, we scoped the plan and initial design of the re-arch as a +large group of maintainers and collaborators early on. This is consistent with the +design we chose and no-one offered alternatives. + \ No newline at end of file diff --git a/autogpt/core/README.md b/autogpt/core/README.md index c27fa28fe..e5bbc108e 100644 --- a/autogpt/core/README.md +++ b/autogpt/core/README.md @@ -6,6 +6,12 @@ a work in progress and is not yet feature complete. In particular, it does not have many of the Auto-GPT commands implemented and is pending ongoing work to [re-incorporate vector-based memory and knowledge retrieval](https://github.com/Significant-Gravitas/Auto-GPT/issues/3536). +## [Overview](ARCHITECTURE_NOTES.md) + +The Auto-GPT Re-arch is a re-implementation of the Auto-GPT agent that is designed to be more modular, +more extensible, and more maintainable than the original Auto-GPT agent. It is also designed to be +more accessible to new developers and to be easier to contribute to. The re-arch is a work in progress +and is not yet feature complete. It is also not yet ready for production use. 
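To make the planning-strategy contract from the architecture notes above concrete, here is a minimal sketch. Only `LanguageModelClassification` (`FAST`/`SMART`), `LanguageModelPrompt`, `build_prompt`, and `parse_content` are taken from the notes; the `PromptStrategy` name, the field layout, and every other detail are illustrative assumptions rather than the actual implementation.

```
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from enum import Enum
from typing import Any


class LanguageModelClassification(Enum):
    FAST = "fast"    # cheaper, lower-latency model
    SMART = "smart"  # more capable, more expensive model


@dataclass
class LanguageModelPrompt:
    # Simple container for the messages and functions passed to the model.
    messages: list[dict[str, Any]] = field(default_factory=list)
    functions: list[dict[str, Any]] = field(default_factory=list)


class PromptStrategy(ABC):
    """One encapsulated language model interaction (name is an assumption)."""

    # Which model tier this strategy targets.
    model_classification: LanguageModelClassification

    @abstractmethod
    def build_prompt(self, **kwargs: Any) -> LanguageModelPrompt:
        """Turn strategy-specific arguments into a concrete prompt."""

    @abstractmethod
    def parse_content(self, response_content: dict) -> dict:
        """Parse raw response content into a better-formatted dict."""
```

A concrete strategy would subclass this, declare its classification, and keep its template strings local to itself, which is exactly the encapsulation the notes argue for.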
## Running the Re-arch Code -- cgit v1.2.3 From 4177c37b51d2f2c8f19c929cadd2609262682605 Mon Sep 17 00:00:00 2001 From: James Collins Date: Wed, 12 Jul 2023 19:36:00 -0700 Subject: Refactor/move functions in app to agent (#4957) * Add links to github issues in the README and clarify run instructions * Move things only used by the agent out of app.py and into the agent module * Fix busted dynamic import --- autogpt/agent/agent.py | 95 ++++++++++++++++++++++++++++++- autogpt/app.py | 114 ------------------------------------- autogpt/main.py | 1 - tests/unit/test_agent.py | 57 +++++++------------ tests/unit/test_execute_command.py | 23 -------- 5 files changed, 111 insertions(+), 179 deletions(-) delete mode 100644 autogpt/app.py delete mode 100644 tests/unit/test_execute_command.py diff --git a/autogpt/agent/agent.py b/autogpt/agent/agent.py index 88b3fa809..316cc4d44 100644 --- a/autogpt/agent/agent.py +++ b/autogpt/agent/agent.py @@ -9,6 +9,7 @@ from colorama import Fore, Style from autogpt.config import Config from autogpt.config.ai_config import AIConfig from autogpt.json_utils.utilities import extract_json_from_response, validate_json +from autogpt.llm import ChatModelResponse from autogpt.llm.chat import chat_with_ai from autogpt.llm.providers.openai import OPEN_AI_CHAT_MODELS from autogpt.llm.utils import count_string_tokens @@ -86,9 +87,6 @@ class Agent: self.smart_token_limit = OPEN_AI_CHAT_MODELS.get(config.smart_llm).max_tokens def start_interaction_loop(self): - # Avoid circular imports - from autogpt.app import execute_command, extract_command - # Interaction Loop self.cycle_count = 0 command_name = None @@ -307,3 +305,94 @@ class Agent: logger.typewriter_log( "SYSTEM: ", Fore.YELLOW, "Unable to execute command" ) + + +def extract_command( + assistant_reply_json: dict, assistant_reply: ChatModelResponse, config: Config +): + """Parse the response and return the command name and arguments + + Args: + assistant_reply_json (dict): The response object from the AI + assistant_reply (ChatModelResponse): The model response from the AI + config (Config): The config object + + Returns: + tuple: The command name and arguments + + Raises: + json.decoder.JSONDecodeError: If the response is not valid JSON + + Exception: If any other error occurs + """ + if config.openai_functions: + if assistant_reply.function_call is None: + return "Error:", "No 'function_call' in assistant reply" + assistant_reply_json["command"] = { + "name": assistant_reply.function_call.name, + "args": json.loads(assistant_reply.function_call.arguments), + } + try: + if "command" not in assistant_reply_json: + return "Error:", "Missing 'command' object in JSON" + + if not isinstance(assistant_reply_json, dict): + return ( + "Error:", + f"The previous message sent was not a dictionary {assistant_reply_json}", + ) + + command = assistant_reply_json["command"] + if not isinstance(command, dict): + return "Error:", "'command' object is not a dictionary" + + if "name" not in command: + return "Error:", "Missing 'name' field in 'command' object" + + command_name = command["name"] + + # Use an empty dictionary if 'args' field is not present in 'command' object + arguments = command.get("args", {}) + + return command_name, arguments + except json.decoder.JSONDecodeError: + return "Error:", "Invalid JSON" + # All other errors, return "Error: + error message" + except Exception as e: + return "Error:", str(e) + + +def execute_command( + command_name: str, + arguments: dict[str, str], + agent: Agent, +): + """Execute the command 
and return the result + + Args: + command_name (str): The name of the command to execute + arguments (dict): The arguments for the command + agent (Agent): The agent that is executing the command + + Returns: + str: The result of the command + """ + try: + # Execute a native command with the same name or alias, if it exists + if command := agent.command_registry.get_command(command_name): + return command(**arguments, agent=agent) + + # Handle non-native commands (e.g. from plugins) + for command in agent.ai_config.prompt_generator.commands: + if ( + command_name == command["label"].lower() + or command_name == command["name"].lower() + ): + return command["function"](**arguments) + + raise RuntimeError( + f"Cannot execute '{command_name}': unknown command." + " Do not try to use this command again." + ) + except Exception as e: + return f"Error: {str(e)}" diff --git a/autogpt/app.py b/autogpt/app.py deleted file mode 100644 index ea5072f81..000000000 --- a/autogpt/app.py +++ /dev/null @@ -1,114 +0,0 @@ -""" Command and Control """ -import json -from typing import Dict - -from autogpt.agent.agent import Agent -from autogpt.config import Config -from autogpt.llm import ChatModelResponse - - -def is_valid_int(value: str) -> bool: - """Check if the value is a valid integer - - Args: - value (str): The value to check - - Returns: - bool: True if the value is a valid integer, False otherwise - """ - try: - int(value) - return True - except ValueError: - return False - - -def extract_command( - assistant_reply_json: Dict, assistant_reply: ChatModelResponse, config: Config -): - """Parse the response and return the command name and arguments - - Args: - assistant_reply_json (dict): The response object from the AI - assistant_reply (ChatModelResponse): The model response from the AI - config (Config): The config object - - Returns: - tuple: The command name and arguments - - Raises: - json.decoder.JSONDecodeError: If the response is not valid JSON - - Exception: If any other error occurs - """ - if config.openai_functions: - if assistant_reply.function_call is None: - return "Error:", "No 'function_call' in assistant reply" - assistant_reply_json["command"] = { - "name": assistant_reply.function_call.name, - "args": json.loads(assistant_reply.function_call.arguments), - } - try: - if "command" not in assistant_reply_json: - return "Error:", "Missing 'command' object in JSON" - - if not isinstance(assistant_reply_json, dict): - return ( - "Error:", - f"The previous message sent was not a dictionary {assistant_reply_json}", - ) - - command = assistant_reply_json["command"] - if not isinstance(command, dict): - return "Error:", "'command' object is not a dictionary" - - if "name" not in command: - return "Error:", "Missing 'name' field in 'command' object" - - command_name = command["name"] - - # Use an empty dictionary if 'args' field is not present in 'command' object - arguments = command.get("args", {}) - - return command_name, arguments - except json.decoder.JSONDecodeError: - return "Error:", "Invalid JSON" - # All other errors, return "Error: + error message" - except Exception as e: - return "Error:", str(e) - - -def execute_command( - command_name: str, - arguments: dict[str, str], - agent: Agent, -): - """Execute the command and return the result - - Args: - command_name (str): The name of the command to execute - arguments (dict): The arguments for the command - agent (Agent): The agent that is executing the command - - Returns: - str: The result of the command - """ - try: - # Execute a 
native command with the same name or alias, if it exists - if command := agent.command_registry.get_command(command_name): - return command(**arguments, agent=agent) - - # Handle non-native commands (e.g. from plugins) - for command in agent.ai_config.prompt_generator.commands: - if ( - command_name == command["label"].lower() - or command_name == command["name"].lower() - ): - return command["function"](**arguments) - - raise RuntimeError( - f"Cannot execute '{command_name}': unknown command." - " Do not try to use this command again." - ) - except Exception as e: - return f"Error: {str(e)}" diff --git a/autogpt/main.py b/autogpt/main.py index 08ac4b400..4ef3fc949 100644 --- a/autogpt/main.py +++ b/autogpt/main.py @@ -28,7 +28,6 @@ COMMAND_CATEGORIES = [ "autogpt.commands.file_operations", "autogpt.commands.web_search", "autogpt.commands.web_selenium", - "autogpt.app", "autogpt.commands.task_statuses", ] diff --git a/tests/unit/test_agent.py b/tests/unit/test_agent.py index 3fb896bad..351454be0 100644 --- a/tests/unit/test_agent.py +++ b/tests/unit/test_agent.py @@ -1,46 +1,27 @@ -from unittest.mock import MagicMock - -import pytest - -from autogpt.agent import Agent -from autogpt.config import AIConfig -from autogpt.config.config import Config - - -@pytest.fixture -def agent(config: Config): - ai_name = "Test AI" - memory = MagicMock() - next_action_count = 0 - command_registry = MagicMock() - ai_config = AIConfig(ai_name=ai_name) - system_prompt = "System prompt" - triggering_prompt = "Triggering prompt" - workspace_directory = "workspace_directory" - - agent = Agent( - ai_name=ai_name, - memory=memory, - next_action_count=next_action_count, - command_registry=command_registry, - ai_config=ai_config, - config=config, - system_prompt=system_prompt, - triggering_prompt=triggering_prompt, - workspace_directory=workspace_directory, - ) - return agent +from autogpt.agent.agent import Agent, execute_command def test_agent_initialization(agent: Agent): - assert agent.ai_name == "Test AI" - assert agent.memory == agent.memory + assert agent.ai_name == "Base" assert agent.history.messages == [] assert agent.next_action_count == 0 - assert agent.command_registry == agent.command_registry - assert agent.ai_config == agent.ai_config - assert agent.system_prompt == "System prompt" - assert agent.triggering_prompt == "Triggering prompt" + + +def test_execute_command_plugin(agent: Agent): + """Test that executing a command that came from a plugin works as expected""" + command_name = "check_plan" + agent.ai_config.prompt_generator.add_command( + command_name, + "Read the plan.md with the next goals to achieve", + {}, + lambda: "hi", + ) + command_result = execute_command( + command_name=command_name, + arguments={}, + agent=agent, + ) + assert command_result == "hi" # More test methods can be added for specific agent interactions diff --git a/tests/unit/test_execute_command.py b/tests/unit/test_execute_command.py deleted file mode 100644 index 21fb0b66e..000000000 --- a/tests/unit/test_execute_command.py +++ /dev/null @@ -1,23 +0,0 @@ -from autogpt.agent import Agent -from autogpt.app import execute_command - - -def check_plan(): - return "hi" - - -def test_execute_command_plugin(agent: Agent): - """Test that executing a command that came from a plugin works as expected""" - command_name = "check_plan" - agent.ai_config.prompt_generator.add_command( - command_name, - "Read the plan.md with the next goals to achieve", - {}, - check_plan, - ) - command_result = execute_command( - 
command_name=command_name, - arguments={}, - agent=agent, - ) - assert command_result == "hi" -- cgit v1.2.3 From c9adedf746758817913f334098110350fb21f8ce Mon Sep 17 00:00:00 2001 From: James Collins Date: Thu, 13 Jul 2023 07:31:49 -0700 Subject: Refactor/rename agent subpackage to agents (#4961) * Add links to github issues in the README and clarify run instructions * Rename agent subpackage to agents * Revert all unwanted changes * Use relative import in `agents/__init__.py` --------- Co-authored-by: Reinier van der Leer --- autogpt/agent/__init__.py | 3 - autogpt/agent/agent.py | 398 --------------------- autogpt/agents/__init__.py | 3 + autogpt/agents/agent.py | 398 +++++++++++++++++++++ autogpt/commands/decorators.py | 2 +- autogpt/commands/execute_code.py | 2 +- autogpt/commands/file_operations.py | 2 +- autogpt/commands/git_operations.py | 2 +- autogpt/commands/image_gen.py | 2 +- autogpt/commands/task_statuses.py | 2 +- autogpt/commands/web_search.py | 2 +- autogpt/commands/web_selenium.py | 2 +- autogpt/llm/chat.py | 2 +- autogpt/main.py | 2 +- autogpt/memory/message_history.py | 2 +- benchmarks.py | 2 +- .../debug_code/test_debug_code_challenge_a.py | 2 +- tests/conftest.py | 2 +- tests/integration/agent_factory.py | 2 +- tests/integration/test_execute_code.py | 2 +- tests/integration/test_image_gen.py | 2 +- tests/integration/test_web_selenium.py | 2 +- tests/unit/test_agent.py | 2 +- tests/unit/test_file_operations.py | 2 +- tests/unit/test_git_commands.py | 2 +- tests/unit/test_message_history.py | 2 +- tests/unit/test_web_search.py | 2 +- 27 files changed, 424 insertions(+), 424 deletions(-) delete mode 100644 autogpt/agent/__init__.py delete mode 100644 autogpt/agent/agent.py create mode 100644 autogpt/agents/__init__.py create mode 100644 autogpt/agents/agent.py diff --git a/autogpt/agent/__init__.py b/autogpt/agent/__init__.py deleted file mode 100644 index 90d1148c2..000000000 --- a/autogpt/agent/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from autogpt.agent.agent import Agent - -__all__ = ["Agent"] diff --git a/autogpt/agent/agent.py b/autogpt/agent/agent.py deleted file mode 100644 index 316cc4d44..000000000 --- a/autogpt/agent/agent.py +++ /dev/null @@ -1,398 +0,0 @@ -import json -import signal -import sys -from datetime import datetime -from pathlib import Path - -from colorama import Fore, Style - -from autogpt.config import Config -from autogpt.config.ai_config import AIConfig -from autogpt.json_utils.utilities import extract_json_from_response, validate_json -from autogpt.llm import ChatModelResponse -from autogpt.llm.chat import chat_with_ai -from autogpt.llm.providers.openai import OPEN_AI_CHAT_MODELS -from autogpt.llm.utils import count_string_tokens -from autogpt.logs import ( - FULL_MESSAGE_HISTORY_FILE_NAME, - NEXT_ACTION_FILE_NAME, - USER_INPUT_FILE_NAME, - LogCycleHandler, - logger, - print_assistant_thoughts, - remove_ansi_escape, -) -from autogpt.memory.message_history import MessageHistory -from autogpt.memory.vector import VectorMemory -from autogpt.models.command_registry import CommandRegistry -from autogpt.speech import say_text -from autogpt.spinner import Spinner -from autogpt.utils import clean_input -from autogpt.workspace import Workspace - - -class Agent: - """Agent class for interacting with Auto-GPT. - - Attributes: - ai_name: The name of the agent. - memory: The memory object to use. - next_action_count: The number of actions to execute. 
- system_prompt: The system prompt is the initial prompt that defines everything - the AI needs to know to achieve its task successfully. - Currently, the dynamic and customizable information in the system prompt are - ai_name, description and goals. - - triggering_prompt: The last sentence the AI will see before answering. - For Auto-GPT, this prompt is: - Determine exactly one command to use, and respond using the format specified - above: - The triggering prompt is not part of the system prompt because between the - system prompt and the triggering - prompt we have contextual information that can distract the AI and make it - forget that its goal is to find the next task to achieve. - SYSTEM PROMPT - CONTEXTUAL INFORMATION (memory, previous conversations, anything relevant) - TRIGGERING PROMPT - - The triggering prompt reminds the AI about its short term meta task - (defining the next task) - """ - - def __init__( - self, - ai_name: str, - memory: VectorMemory, - next_action_count: int, - command_registry: CommandRegistry, - ai_config: AIConfig, - system_prompt: str, - triggering_prompt: str, - workspace_directory: str | Path, - config: Config, - ): - self.ai_name = ai_name - self.memory = memory - self.history = MessageHistory.for_model(config.smart_llm, agent=self) - self.next_action_count = next_action_count - self.command_registry = command_registry - self.config = config - self.ai_config = ai_config - self.system_prompt = system_prompt - self.triggering_prompt = triggering_prompt - self.workspace = Workspace(workspace_directory, config.restrict_to_workspace) - self.created_at = datetime.now().strftime("%Y%m%d_%H%M%S") - self.cycle_count = 0 - self.log_cycle_handler = LogCycleHandler() - self.smart_token_limit = OPEN_AI_CHAT_MODELS.get(config.smart_llm).max_tokens - - def start_interaction_loop(self): - # Interaction Loop - self.cycle_count = 0 - command_name = None - arguments = None - user_input = "" - - # Signal handler for interrupting y -N - def signal_handler(signum, frame): - if self.next_action_count == 0: - sys.exit() - else: - print( - Fore.RED - + "Interrupt signal received. Stopping continuous command execution." - + Style.RESET_ALL - ) - self.next_action_count = 0 - - signal.signal(signal.SIGINT, signal_handler) - - while True: - # Discontinue if continuous limit is reached - self.cycle_count += 1 - self.log_cycle_handler.log_count_within_cycle = 0 - self.log_cycle_handler.log_cycle( - self.ai_config.ai_name, - self.created_at, - self.cycle_count, - [m.raw() for m in self.history], - FULL_MESSAGE_HISTORY_FILE_NAME, - ) - if ( - self.config.continuous_mode - and self.config.continuous_limit > 0 - and self.cycle_count > self.config.continuous_limit - ): - logger.typewriter_log( - "Continuous Limit Reached: ", - Fore.YELLOW, - f"{self.config.continuous_limit}", - ) - break - # Send message to AI, get response - with Spinner("Thinking... 
", plain_output=self.config.plain_output): - assistant_reply = chat_with_ai( - self.config, - self, - self.system_prompt, - self.triggering_prompt, - self.smart_token_limit, - self.config.smart_llm, - ) - - try: - assistant_reply_json = extract_json_from_response( - assistant_reply.content - ) - validate_json(assistant_reply_json, self.config) - except json.JSONDecodeError as e: - logger.error(f"Exception while validating assistant reply JSON: {e}") - assistant_reply_json = {} - - for plugin in self.config.plugins: - if not plugin.can_handle_post_planning(): - continue - assistant_reply_json = plugin.post_planning(assistant_reply_json) - - # Print Assistant thoughts - if assistant_reply_json != {}: - # Get command name and arguments - try: - print_assistant_thoughts( - self.ai_name, assistant_reply_json, self.config - ) - command_name, arguments = extract_command( - assistant_reply_json, assistant_reply, self.config - ) - if self.config.speak_mode: - say_text(f"I want to execute {command_name}", self.config) - - except Exception as e: - logger.error("Error: \n", str(e)) - self.log_cycle_handler.log_cycle( - self.ai_config.ai_name, - self.created_at, - self.cycle_count, - assistant_reply_json, - NEXT_ACTION_FILE_NAME, - ) - - # First log new-line so user can differentiate sections better in console - logger.typewriter_log("\n") - logger.typewriter_log( - "NEXT ACTION: ", - Fore.CYAN, - f"COMMAND = {Fore.CYAN}{remove_ansi_escape(command_name)}{Style.RESET_ALL} " - f"ARGUMENTS = {Fore.CYAN}{arguments}{Style.RESET_ALL}", - ) - - if not self.config.continuous_mode and self.next_action_count == 0: - # ### GET USER AUTHORIZATION TO EXECUTE COMMAND ### - # Get key press: Prompt the user to press enter to continue or escape - # to exit - self.user_input = "" - logger.info( - f"Enter '{self.config.authorise_key}' to authorise command, " - f"'{self.config.authorise_key} -N' to run N continuous commands, " - f"'{self.config.exit_key}' to exit program, or enter feedback for " - f"{self.ai_name}..." - ) - while True: - if self.config.chat_messages_enabled: - console_input = clean_input( - self.config, "Waiting for your response..." - ) - else: - console_input = clean_input( - self.config, Fore.MAGENTA + "Input:" + Style.RESET_ALL - ) - if console_input.lower().strip() == self.config.authorise_key: - user_input = "GENERATE NEXT COMMAND JSON" - break - elif console_input.lower().strip() == "": - logger.warn("Invalid input format.") - continue - elif console_input.lower().startswith( - f"{self.config.authorise_key} -" - ): - try: - self.next_action_count = abs( - int(console_input.split(" ")[1]) - ) - user_input = "GENERATE NEXT COMMAND JSON" - except ValueError: - logger.warn( - f"Invalid input format. Please enter '{self.config.authorise_key} -n' " - "where n is the number of continuous tasks." 
- ) - continue - break - elif console_input.lower() == self.config.exit_key: - user_input = "EXIT" - break - else: - user_input = console_input - command_name = "human_feedback" - self.log_cycle_handler.log_cycle( - self.ai_config.ai_name, - self.created_at, - self.cycle_count, - user_input, - USER_INPUT_FILE_NAME, - ) - break - - if user_input == "GENERATE NEXT COMMAND JSON": - logger.typewriter_log( - "-=-=-=-=-=-=-= COMMAND AUTHORISED BY USER -=-=-=-=-=-=-=", - Fore.MAGENTA, - "", - ) - elif user_input == "EXIT": - logger.info("Exiting...") - break - else: - # First log new-line so user can differentiate sections better in console - logger.typewriter_log("\n") - # Print authorized commands left value - logger.typewriter_log( - f"{Fore.CYAN}AUTHORISED COMMANDS LEFT: {Style.RESET_ALL}{self.next_action_count}" - ) - - # Execute command - if command_name is not None and command_name.lower().startswith("error"): - result = f"Could not execute command: {arguments}" - elif command_name == "human_feedback": - result = f"Human feedback: {user_input}" - else: - for plugin in self.config.plugins: - if not plugin.can_handle_pre_command(): - continue - command_name, arguments = plugin.pre_command( - command_name, arguments - ) - command_result = execute_command( - command_name=command_name, - arguments=arguments, - agent=self, - ) - result = f"Command {command_name} returned: " f"{command_result}" - - result_tlength = count_string_tokens( - str(command_result), self.config.smart_llm - ) - memory_tlength = count_string_tokens( - str(self.history.summary_message()), self.config.smart_llm - ) - if result_tlength + memory_tlength + 600 > self.smart_token_limit: - result = f"Failure: command {command_name} returned too much output. \ - Do not execute this command again with the same arguments." 
- - for plugin in self.config.plugins: - if not plugin.can_handle_post_command(): - continue - result = plugin.post_command(command_name, result) - if self.next_action_count > 0: - self.next_action_count -= 1 - - # Check if there's a result from the command append it to the message - # history - if result is not None: - self.history.add("system", result, "action_result") - logger.typewriter_log("SYSTEM: ", Fore.YELLOW, result) - else: - self.history.add("system", "Unable to execute command", "action_result") - logger.typewriter_log( - "SYSTEM: ", Fore.YELLOW, "Unable to execute command" - ) - - -def extract_command( - assistant_reply_json: dict, assistant_reply: ChatModelResponse, config: Config -): - """Parse the response and return the command name and arguments - - Args: - assistant_reply_json (dict): The response object from the AI - assistant_reply (ChatModelResponse): The model response from the AI - config (Config): The config object - - Returns: - tuple: The command name and arguments - - Raises: - json.decoder.JSONDecodeError: If the response is not valid JSON - - Exception: If any other error occurs - """ - if config.openai_functions: - if assistant_reply.function_call is None: - return "Error:", "No 'function_call' in assistant reply" - assistant_reply_json["command"] = { - "name": assistant_reply.function_call.name, - "args": json.loads(assistant_reply.function_call.arguments), - } - try: - if "command" not in assistant_reply_json: - return "Error:", "Missing 'command' object in JSON" - - if not isinstance(assistant_reply_json, dict): - return ( - "Error:", - f"The previous message sent was not a dictionary {assistant_reply_json}", - ) - - command = assistant_reply_json["command"] - if not isinstance(command, dict): - return "Error:", "'command' object is not a dictionary" - - if "name" not in command: - return "Error:", "Missing 'name' field in 'command' object" - - command_name = command["name"] - - # Use an empty dictionary if 'args' field is not present in 'command' object - arguments = command.get("args", {}) - - return command_name, arguments - except json.decoder.JSONDecodeError: - return "Error:", "Invalid JSON" - # All other errors, return "Error: + error message" - except Exception as e: - return "Error:", str(e) - - -def execute_command( - command_name: str, - arguments: dict[str, str], - agent: Agent, -): - """Execute the command and return the result - - Args: - command_name (str): The name of the command to execute - arguments (dict): The arguments for the command - agent (Agent): The agent that is executing the command - - Returns: - str: The result of the command - """ - try: - # Execute a native command with the same name or alias, if it exists - if command := agent.command_registry.get_command(command_name): - return command(**arguments, agent=agent) - - # Handle non-native commands (e.g. from plugins) - for command in agent.ai_config.prompt_generator.commands: - if ( - command_name == command["label"].lower() - or command_name == command["name"].lower() - ): - return command["function"](**arguments) - - raise RuntimeError( - f"Cannot execute '{command_name}': unknown command." - " Do not try to use this command again." 
- ) - except Exception as e: - return f"Error: {str(e)}" diff --git a/autogpt/agents/__init__.py b/autogpt/agents/__init__.py new file mode 100644 index 000000000..a6df24ad7 --- /dev/null +++ b/autogpt/agents/__init__.py @@ -0,0 +1,3 @@ +from .agent import Agent + +__all__ = ["Agent"] diff --git a/autogpt/agents/agent.py b/autogpt/agents/agent.py new file mode 100644 index 000000000..316cc4d44 --- /dev/null +++ b/autogpt/agents/agent.py @@ -0,0 +1,398 @@ +import json +import signal +import sys +from datetime import datetime +from pathlib import Path + +from colorama import Fore, Style + +from autogpt.config import Config +from autogpt.config.ai_config import AIConfig +from autogpt.json_utils.utilities import extract_json_from_response, validate_json +from autogpt.llm import ChatModelResponse +from autogpt.llm.chat import chat_with_ai +from autogpt.llm.providers.openai import OPEN_AI_CHAT_MODELS +from autogpt.llm.utils import count_string_tokens +from autogpt.logs import ( + FULL_MESSAGE_HISTORY_FILE_NAME, + NEXT_ACTION_FILE_NAME, + USER_INPUT_FILE_NAME, + LogCycleHandler, + logger, + print_assistant_thoughts, + remove_ansi_escape, +) +from autogpt.memory.message_history import MessageHistory +from autogpt.memory.vector import VectorMemory +from autogpt.models.command_registry import CommandRegistry +from autogpt.speech import say_text +from autogpt.spinner import Spinner +from autogpt.utils import clean_input +from autogpt.workspace import Workspace + + +class Agent: + """Agent class for interacting with Auto-GPT. + + Attributes: + ai_name: The name of the agent. + memory: The memory object to use. + next_action_count: The number of actions to execute. + system_prompt: The system prompt is the initial prompt that defines everything + the AI needs to know to achieve its task successfully. + Currently, the dynamic and customizable information in the system prompt are + ai_name, description and goals. + + triggering_prompt: The last sentence the AI will see before answering. + For Auto-GPT, this prompt is: + Determine exactly one command to use, and respond using the format specified + above: + The triggering prompt is not part of the system prompt because between the + system prompt and the triggering + prompt we have contextual information that can distract the AI and make it + forget that its goal is to find the next task to achieve. 
+ SYSTEM PROMPT + CONTEXTUAL INFORMATION (memory, previous conversations, anything relevant) + TRIGGERING PROMPT + + The triggering prompt reminds the AI about its short term meta task + (defining the next task) + """ + + def __init__( + self, + ai_name: str, + memory: VectorMemory, + next_action_count: int, + command_registry: CommandRegistry, + ai_config: AIConfig, + system_prompt: str, + triggering_prompt: str, + workspace_directory: str | Path, + config: Config, + ): + self.ai_name = ai_name + self.memory = memory + self.history = MessageHistory.for_model(config.smart_llm, agent=self) + self.next_action_count = next_action_count + self.command_registry = command_registry + self.config = config + self.ai_config = ai_config + self.system_prompt = system_prompt + self.triggering_prompt = triggering_prompt + self.workspace = Workspace(workspace_directory, config.restrict_to_workspace) + self.created_at = datetime.now().strftime("%Y%m%d_%H%M%S") + self.cycle_count = 0 + self.log_cycle_handler = LogCycleHandler() + self.smart_token_limit = OPEN_AI_CHAT_MODELS.get(config.smart_llm).max_tokens + + def start_interaction_loop(self): + # Interaction Loop + self.cycle_count = 0 + command_name = None + arguments = None + user_input = "" + + # Signal handler for interrupting y -N + def signal_handler(signum, frame): + if self.next_action_count == 0: + sys.exit() + else: + print( + Fore.RED + + "Interrupt signal received. Stopping continuous command execution." + + Style.RESET_ALL + ) + self.next_action_count = 0 + + signal.signal(signal.SIGINT, signal_handler) + + while True: + # Discontinue if continuous limit is reached + self.cycle_count += 1 + self.log_cycle_handler.log_count_within_cycle = 0 + self.log_cycle_handler.log_cycle( + self.ai_config.ai_name, + self.created_at, + self.cycle_count, + [m.raw() for m in self.history], + FULL_MESSAGE_HISTORY_FILE_NAME, + ) + if ( + self.config.continuous_mode + and self.config.continuous_limit > 0 + and self.cycle_count > self.config.continuous_limit + ): + logger.typewriter_log( + "Continuous Limit Reached: ", + Fore.YELLOW, + f"{self.config.continuous_limit}", + ) + break + # Send message to AI, get response + with Spinner("Thinking... 
", plain_output=self.config.plain_output): + assistant_reply = chat_with_ai( + self.config, + self, + self.system_prompt, + self.triggering_prompt, + self.smart_token_limit, + self.config.smart_llm, + ) + + try: + assistant_reply_json = extract_json_from_response( + assistant_reply.content + ) + validate_json(assistant_reply_json, self.config) + except json.JSONDecodeError as e: + logger.error(f"Exception while validating assistant reply JSON: {e}") + assistant_reply_json = {} + + for plugin in self.config.plugins: + if not plugin.can_handle_post_planning(): + continue + assistant_reply_json = plugin.post_planning(assistant_reply_json) + + # Print Assistant thoughts + if assistant_reply_json != {}: + # Get command name and arguments + try: + print_assistant_thoughts( + self.ai_name, assistant_reply_json, self.config + ) + command_name, arguments = extract_command( + assistant_reply_json, assistant_reply, self.config + ) + if self.config.speak_mode: + say_text(f"I want to execute {command_name}", self.config) + + except Exception as e: + logger.error("Error: \n", str(e)) + self.log_cycle_handler.log_cycle( + self.ai_config.ai_name, + self.created_at, + self.cycle_count, + assistant_reply_json, + NEXT_ACTION_FILE_NAME, + ) + + # First log new-line so user can differentiate sections better in console + logger.typewriter_log("\n") + logger.typewriter_log( + "NEXT ACTION: ", + Fore.CYAN, + f"COMMAND = {Fore.CYAN}{remove_ansi_escape(command_name)}{Style.RESET_ALL} " + f"ARGUMENTS = {Fore.CYAN}{arguments}{Style.RESET_ALL}", + ) + + if not self.config.continuous_mode and self.next_action_count == 0: + # ### GET USER AUTHORIZATION TO EXECUTE COMMAND ### + # Get key press: Prompt the user to press enter to continue or escape + # to exit + self.user_input = "" + logger.info( + f"Enter '{self.config.authorise_key}' to authorise command, " + f"'{self.config.authorise_key} -N' to run N continuous commands, " + f"'{self.config.exit_key}' to exit program, or enter feedback for " + f"{self.ai_name}..." + ) + while True: + if self.config.chat_messages_enabled: + console_input = clean_input( + self.config, "Waiting for your response..." + ) + else: + console_input = clean_input( + self.config, Fore.MAGENTA + "Input:" + Style.RESET_ALL + ) + if console_input.lower().strip() == self.config.authorise_key: + user_input = "GENERATE NEXT COMMAND JSON" + break + elif console_input.lower().strip() == "": + logger.warn("Invalid input format.") + continue + elif console_input.lower().startswith( + f"{self.config.authorise_key} -" + ): + try: + self.next_action_count = abs( + int(console_input.split(" ")[1]) + ) + user_input = "GENERATE NEXT COMMAND JSON" + except ValueError: + logger.warn( + f"Invalid input format. Please enter '{self.config.authorise_key} -n' " + "where n is the number of continuous tasks." 
+ ) + continue + break + elif console_input.lower() == self.config.exit_key: + user_input = "EXIT" + break + else: + user_input = console_input + command_name = "human_feedback" + self.log_cycle_handler.log_cycle( + self.ai_config.ai_name, + self.created_at, + self.cycle_count, + user_input, + USER_INPUT_FILE_NAME, + ) + break + + if user_input == "GENERATE NEXT COMMAND JSON": + logger.typewriter_log( + "-=-=-=-=-=-=-= COMMAND AUTHORISED BY USER -=-=-=-=-=-=-=", + Fore.MAGENTA, + "", + ) + elif user_input == "EXIT": + logger.info("Exiting...") + break + else: + # First log new-line so user can differentiate sections better in console + logger.typewriter_log("\n") + # Print authorized commands left value + logger.typewriter_log( + f"{Fore.CYAN}AUTHORISED COMMANDS LEFT: {Style.RESET_ALL}{self.next_action_count}" + ) + + # Execute command + if command_name is not None and command_name.lower().startswith("error"): + result = f"Could not execute command: {arguments}" + elif command_name == "human_feedback": + result = f"Human feedback: {user_input}" + else: + for plugin in self.config.plugins: + if not plugin.can_handle_pre_command(): + continue + command_name, arguments = plugin.pre_command( + command_name, arguments + ) + command_result = execute_command( + command_name=command_name, + arguments=arguments, + agent=self, + ) + result = f"Command {command_name} returned: " f"{command_result}" + + result_tlength = count_string_tokens( + str(command_result), self.config.smart_llm + ) + memory_tlength = count_string_tokens( + str(self.history.summary_message()), self.config.smart_llm + ) + if result_tlength + memory_tlength + 600 > self.smart_token_limit: + result = f"Failure: command {command_name} returned too much output. \ + Do not execute this command again with the same arguments." 
+ + for plugin in self.config.plugins: + if not plugin.can_handle_post_command(): + continue + result = plugin.post_command(command_name, result) + if self.next_action_count > 0: + self.next_action_count -= 1 + + # Check if there's a result from the command append it to the message + # history + if result is not None: + self.history.add("system", result, "action_result") + logger.typewriter_log("SYSTEM: ", Fore.YELLOW, result) + else: + self.history.add("system", "Unable to execute command", "action_result") + logger.typewriter_log( + "SYSTEM: ", Fore.YELLOW, "Unable to execute command" + ) + + +def extract_command( + assistant_reply_json: dict, assistant_reply: ChatModelResponse, config: Config +): + """Parse the response and return the command name and arguments + + Args: + assistant_reply_json (dict): The response object from the AI + assistant_reply (ChatModelResponse): The model response from the AI + config (Config): The config object + + Returns: + tuple: The command name and arguments + + Raises: + json.decoder.JSONDecodeError: If the response is not valid JSON + + Exception: If any other error occurs + """ + if config.openai_functions: + if assistant_reply.function_call is None: + return "Error:", "No 'function_call' in assistant reply" + assistant_reply_json["command"] = { + "name": assistant_reply.function_call.name, + "args": json.loads(assistant_reply.function_call.arguments), + } + try: + if "command" not in assistant_reply_json: + return "Error:", "Missing 'command' object in JSON" + + if not isinstance(assistant_reply_json, dict): + return ( + "Error:", + f"The previous message sent was not a dictionary {assistant_reply_json}", + ) + + command = assistant_reply_json["command"] + if not isinstance(command, dict): + return "Error:", "'command' object is not a dictionary" + + if "name" not in command: + return "Error:", "Missing 'name' field in 'command' object" + + command_name = command["name"] + + # Use an empty dictionary if 'args' field is not present in 'command' object + arguments = command.get("args", {}) + + return command_name, arguments + except json.decoder.JSONDecodeError: + return "Error:", "Invalid JSON" + # All other errors, return "Error: + error message" + except Exception as e: + return "Error:", str(e) + + +def execute_command( + command_name: str, + arguments: dict[str, str], + agent: Agent, +): + """Execute the command and return the result + + Args: + command_name (str): The name of the command to execute + arguments (dict): The arguments for the command + agent (Agent): The agent that is executing the command + + Returns: + str: The result of the command + """ + try: + # Execute a native command with the same name or alias, if it exists + if command := agent.command_registry.get_command(command_name): + return command(**arguments, agent=agent) + + # Handle non-native commands (e.g. from plugins) + for command in agent.ai_config.prompt_generator.commands: + if ( + command_name == command["label"].lower() + or command_name == command["name"].lower() + ): + return command["function"](**arguments) + + raise RuntimeError( + f"Cannot execute '{command_name}': unknown command." + " Do not try to use this command again." 
+ ) + except Exception as e: + return f"Error: {str(e)}" diff --git a/autogpt/commands/decorators.py b/autogpt/commands/decorators.py index 3528af04b..b63c76d53 100644 --- a/autogpt/commands/decorators.py +++ b/autogpt/commands/decorators.py @@ -2,7 +2,7 @@ import functools from pathlib import Path from typing import Callable -from autogpt.agent.agent import Agent +from autogpt.agents.agent import Agent from autogpt.logs import logger diff --git a/autogpt/commands/execute_code.py b/autogpt/commands/execute_code.py index aad93193e..2403b2ba5 100644 --- a/autogpt/commands/execute_code.py +++ b/autogpt/commands/execute_code.py @@ -7,7 +7,7 @@ import docker from docker.errors import DockerException, ImageNotFound from docker.models.containers import Container as DockerContainer -from autogpt.agent.agent import Agent +from autogpt.agents.agent import Agent from autogpt.command_decorator import command from autogpt.config import Config from autogpt.logs import logger diff --git a/autogpt/commands/file_operations.py b/autogpt/commands/file_operations.py index 0a06da318..939b7dc18 100644 --- a/autogpt/commands/file_operations.py +++ b/autogpt/commands/file_operations.py @@ -8,7 +8,7 @@ import os.path from pathlib import Path from typing import Generator, Literal -from autogpt.agent.agent import Agent +from autogpt.agents.agent import Agent from autogpt.command_decorator import command from autogpt.logs import logger from autogpt.memory.vector import MemoryItem, VectorMemory diff --git a/autogpt/commands/git_operations.py b/autogpt/commands/git_operations.py index 276031f78..021157fbb 100644 --- a/autogpt/commands/git_operations.py +++ b/autogpt/commands/git_operations.py @@ -2,7 +2,7 @@ from git.repo import Repo -from autogpt.agent.agent import Agent +from autogpt.agents.agent import Agent from autogpt.command_decorator import command from autogpt.url_utils.validators import validate_url diff --git a/autogpt/commands/image_gen.py b/autogpt/commands/image_gen.py index b1a89b289..abae6149e 100644 --- a/autogpt/commands/image_gen.py +++ b/autogpt/commands/image_gen.py @@ -9,7 +9,7 @@ import openai import requests from PIL import Image -from autogpt.agent.agent import Agent +from autogpt.agents.agent import Agent from autogpt.command_decorator import command from autogpt.logs import logger diff --git a/autogpt/commands/task_statuses.py b/autogpt/commands/task_statuses.py index 062ebe3a4..34908928f 100644 --- a/autogpt/commands/task_statuses.py +++ b/autogpt/commands/task_statuses.py @@ -3,7 +3,7 @@ from __future__ import annotations from typing import NoReturn -from autogpt.agent.agent import Agent +from autogpt.agents.agent import Agent from autogpt.command_decorator import command from autogpt.logs import logger diff --git a/autogpt/commands/web_search.py b/autogpt/commands/web_search.py index d47d680b2..9ea0d2061 100644 --- a/autogpt/commands/web_search.py +++ b/autogpt/commands/web_search.py @@ -7,7 +7,7 @@ from itertools import islice from duckduckgo_search import DDGS -from autogpt.agent.agent import Agent +from autogpt.agents.agent import Agent from autogpt.command_decorator import command DUCKDUCKGO_MAX_ATTEMPTS = 3 diff --git a/autogpt/commands/web_selenium.py b/autogpt/commands/web_selenium.py index 821957f3e..948d799e9 100644 --- a/autogpt/commands/web_selenium.py +++ b/autogpt/commands/web_selenium.py @@ -27,7 +27,7 @@ from webdriver_manager.chrome import ChromeDriverManager from webdriver_manager.firefox import GeckoDriverManager from webdriver_manager.microsoft import 
EdgeChromiumDriverManager as EdgeDriverManager -from autogpt.agent.agent import Agent +from autogpt.agents.agent import Agent from autogpt.command_decorator import command from autogpt.logs import logger from autogpt.memory.vector import MemoryItem, get_memory diff --git a/autogpt/llm/chat.py b/autogpt/llm/chat.py index 4364cb1d8..f08fdab4e 100644 --- a/autogpt/llm/chat.py +++ b/autogpt/llm/chat.py @@ -4,7 +4,7 @@ import time from typing import TYPE_CHECKING if TYPE_CHECKING: - from autogpt.agent.agent import Agent + from autogpt.agents.agent import Agent from autogpt.config import Config from autogpt.llm.api_manager import ApiManager diff --git a/autogpt/main.py b/autogpt/main.py index 4ef3fc949..0da2d193b 100644 --- a/autogpt/main.py +++ b/autogpt/main.py @@ -6,7 +6,7 @@ from typing import Optional from colorama import Fore, Style -from autogpt.agent import Agent +from autogpt.agents import Agent from autogpt.config.config import ConfigBuilder, check_openai_api_key from autogpt.configurator import create_config from autogpt.logs import logger diff --git a/autogpt/memory/message_history.py b/autogpt/memory/message_history.py index 30dbbb809..c718f2edb 100644 --- a/autogpt/memory/message_history.py +++ b/autogpt/memory/message_history.py @@ -6,7 +6,7 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Optional if TYPE_CHECKING: - from autogpt.agent import Agent + from autogpt.agents import Agent from autogpt.config import Config from autogpt.json_utils.utilities import extract_json_from_response diff --git a/benchmarks.py b/benchmarks.py index cb592be8a..2e143f9d6 100644 --- a/benchmarks.py +++ b/benchmarks.py @@ -1,4 +1,4 @@ -from autogpt.agent import Agent +from autogpt.agents import Agent from autogpt.config import AIConfig, Config, ConfigBuilder from autogpt.main import COMMAND_CATEGORIES from autogpt.memory.vector import get_memory diff --git a/tests/challenges/debug_code/test_debug_code_challenge_a.py b/tests/challenges/debug_code/test_debug_code_challenge_a.py index 90a7084dc..c846f9ce5 100644 --- a/tests/challenges/debug_code/test_debug_code_challenge_a.py +++ b/tests/challenges/debug_code/test_debug_code_challenge_a.py @@ -3,7 +3,7 @@ from pathlib import Path import pytest from pytest_mock import MockerFixture -from autogpt.agent import Agent +from autogpt.agents import Agent from autogpt.commands.execute_code import execute_python_file from autogpt.workspace import Workspace from tests.challenges.challenge_decorator.challenge_decorator import challenge diff --git a/tests/conftest.py b/tests/conftest.py index 64e840247..09d358e69 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,7 +6,7 @@ import pytest import yaml from pytest_mock import MockerFixture -from autogpt.agent.agent import Agent +from autogpt.agents.agent import Agent from autogpt.config import AIConfig, Config, ConfigBuilder from autogpt.config.ai_config import AIConfig from autogpt.llm.api_manager import ApiManager diff --git a/tests/integration/agent_factory.py b/tests/integration/agent_factory.py index 664c6cbb4..d3832c27a 100644 --- a/tests/integration/agent_factory.py +++ b/tests/integration/agent_factory.py @@ -1,6 +1,6 @@ import pytest -from autogpt.agent import Agent +from autogpt.agents import Agent from autogpt.config import AIConfig, Config from autogpt.memory.vector import get_memory from autogpt.models.command_registry import CommandRegistry diff --git a/tests/integration/test_execute_code.py b/tests/integration/test_execute_code.py index b7be8622a..80010c6f2 100644 --- 
a/tests/integration/test_execute_code.py +++ b/tests/integration/test_execute_code.py @@ -7,7 +7,7 @@ import tempfile import pytest import autogpt.commands.execute_code as sut # system under testing -from autogpt.agent.agent import Agent +from autogpt.agents.agent import Agent from autogpt.config import Config diff --git a/tests/integration/test_image_gen.py b/tests/integration/test_image_gen.py index 8cdcfd986..0a9f68978 100644 --- a/tests/integration/test_image_gen.py +++ b/tests/integration/test_image_gen.py @@ -6,7 +6,7 @@ from unittest.mock import patch import pytest from PIL import Image -from autogpt.agent.agent import Agent +from autogpt.agents.agent import Agent from autogpt.commands.image_gen import generate_image, generate_image_with_sd_webui diff --git a/tests/integration/test_web_selenium.py b/tests/integration/test_web_selenium.py index e900b4b3f..43de2860e 100644 --- a/tests/integration/test_web_selenium.py +++ b/tests/integration/test_web_selenium.py @@ -1,7 +1,7 @@ import pytest from pytest_mock import MockerFixture -from autogpt.agent.agent import Agent +from autogpt.agents.agent import Agent from autogpt.commands.web_selenium import browse_website diff --git a/tests/unit/test_agent.py b/tests/unit/test_agent.py index 351454be0..7baeeb64f 100644 --- a/tests/unit/test_agent.py +++ b/tests/unit/test_agent.py @@ -1,4 +1,4 @@ -from autogpt.agent.agent import Agent, execute_command +from autogpt.agents.agent import Agent, execute_command def test_agent_initialization(agent: Agent): diff --git a/tests/unit/test_file_operations.py b/tests/unit/test_file_operations.py index f9c571d8c..d7d870a59 100644 --- a/tests/unit/test_file_operations.py +++ b/tests/unit/test_file_operations.py @@ -12,7 +12,7 @@ import pytest from pytest_mock import MockerFixture import autogpt.commands.file_operations as file_ops -from autogpt.agent.agent import Agent +from autogpt.agents.agent import Agent from autogpt.config import Config from autogpt.memory.vector.memory_item import MemoryItem from autogpt.memory.vector.utils import Embedding diff --git a/tests/unit/test_git_commands.py b/tests/unit/test_git_commands.py index a6defdfc3..9f56a3840 100644 --- a/tests/unit/test_git_commands.py +++ b/tests/unit/test_git_commands.py @@ -2,7 +2,7 @@ import pytest from git.exc import GitCommandError from git.repo.base import Repo -from autogpt.agent.agent import Agent +from autogpt.agents.agent import Agent from autogpt.commands.git_operations import clone_repository diff --git a/tests/unit/test_message_history.py b/tests/unit/test_message_history.py index 9b275252c..ec01cd558 100644 --- a/tests/unit/test_message_history.py +++ b/tests/unit/test_message_history.py @@ -4,7 +4,7 @@ from unittest.mock import MagicMock import pytest -from autogpt.agent import Agent +from autogpt.agents import Agent from autogpt.config import AIConfig from autogpt.config.config import Config from autogpt.llm.base import ChatModelResponse, ChatSequence, Message diff --git a/tests/unit/test_web_search.py b/tests/unit/test_web_search.py index 4f5143069..790b1c2f6 100644 --- a/tests/unit/test_web_search.py +++ b/tests/unit/test_web_search.py @@ -3,7 +3,7 @@ import json import pytest from googleapiclient.errors import HttpError -from autogpt.agent.agent import Agent +from autogpt.agents.agent import Agent from autogpt.commands.web_search import google, safe_google_results, web_search -- cgit v1.2.3 From a0f5aa942de3a94ce6a173c784d7a73b17a134a6 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Thu, 13 Jul 2023 18:35:50 +0200 
Subject: Fix Netlify preview builds --- netlify.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/netlify.toml b/netlify.toml index 43e79f0fd..de261908f 100644 --- a/netlify.toml +++ b/netlify.toml @@ -3,4 +3,4 @@ [build] publish = "public/" command = "mkdocs build -d public" - ignore = "git diff --quiet HEAD^ HEAD docs mkdocs.yml CONTRIBUTING.md CODE_OF_CONDUCT.md LICENSE" + ignore = "git diff --quiet $CACHED_COMMIT_REF $COMMIT_REF docs mkdocs.yml CONTRIBUTING.md CODE_OF_CONDUCT.md LICENSE" -- cgit v1.2.3 From ed12b2c7d6036e16ac49123d62963f0819861fdd Mon Sep 17 00:00:00 2001 From: Antonov Maxim <99024963+antonovmaxim@users.noreply.github.com> Date: Thu, 13 Jul 2023 20:21:25 +0300 Subject: Allow absolute paths when not restricting to workspace root (#4946) * restrict_to_root fix * Fix formatting --------- Co-authored-by: Reinier van der Leer --- autogpt/workspace/workspace.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/autogpt/workspace/workspace.py b/autogpt/workspace/workspace.py index 6d90f8540..07186e735 100644 --- a/autogpt/workspace/workspace.py +++ b/autogpt/workspace/workspace.py @@ -123,7 +123,11 @@ class Workspace: logger.debug(f"Resolved root as '{root}'") # Allow exception for absolute paths if they are contained in your workspace directory. - if relative_path.is_absolute() and not relative_path.is_relative_to(root): + if ( + relative_path.is_absolute() + and restrict_to_root + and not relative_path.is_relative_to(root) + ): raise ValueError( f"Attempted to access absolute path '{relative_path}' in workspace '{root}'." ) -- cgit v1.2.3 From 7c4fc45b4a536e6423b6491c34953a2516863d78 Mon Sep 17 00:00:00 2001 From: Vasek Mlejnsky Date: Thu, 13 Jul 2023 18:14:57 -0600 Subject: Add initial share logs page (#4965) * Add initial share logs page * Fix title sizes * Update share_logs.md Added some text for reasons to share logs * Add section on how to share logs using e2b * Fix path to images with sizes * Fix paths to images in docs * Fix formatting * Fix formatting * Fix grammar * Make position in menu more prominent * original log directory was incorrect I took the directory from usage.md but that is incorrect * Updated the directory for the logs Updated the directory for the logs * added some text and made it pretty --------- Co-authored-by: NeonN3mesis <129052650+NeonN3mesis@users.noreply.github.com> Co-authored-by: Reinier van der Leer --- docs/imgs/e2b-dashboard.png | Bin 0 -> 515634 bytes docs/imgs/e2b-log-url.png | Bin 0 -> 43687 bytes docs/imgs/e2b-new-tag.png | Bin 0 -> 47736 bytes docs/imgs/e2b-tag-button.png | Bin 0 -> 20635 bytes docs/share-your-logs.md | 52 +++++++++++++++++++++++++++++++++++++++++++ docs/usage.md | 12 ++++++++-- mkdocs.yml | 3 ++- 7 files changed, 64 insertions(+), 3 deletions(-) create mode 100644 docs/imgs/e2b-dashboard.png create mode 100644 docs/imgs/e2b-log-url.png create mode 100644 docs/imgs/e2b-new-tag.png create mode 100644 docs/imgs/e2b-tag-button.png create mode 100644 docs/share-your-logs.md diff --git a/docs/imgs/e2b-dashboard.png b/docs/imgs/e2b-dashboard.png new file mode 100644 index 000000000..456f1490c Binary files /dev/null and b/docs/imgs/e2b-dashboard.png differ diff --git a/docs/imgs/e2b-log-url.png b/docs/imgs/e2b-log-url.png new file mode 100644 index 000000000..3f1c189ee Binary files /dev/null and b/docs/imgs/e2b-log-url.png differ diff --git a/docs/imgs/e2b-new-tag.png b/docs/imgs/e2b-new-tag.png new file mode 100644 index 000000000..65a0a767c Binary files /dev/null and 
b/docs/imgs/e2b-new-tag.png differ diff --git a/docs/imgs/e2b-tag-button.png b/docs/imgs/e2b-tag-button.png new file mode 100644 index 000000000..741a6bac1 Binary files /dev/null and b/docs/imgs/e2b-tag-button.png differ diff --git a/docs/share-your-logs.md b/docs/share-your-logs.md new file mode 100644 index 000000000..f673e375c --- /dev/null +++ b/docs/share-your-logs.md @@ -0,0 +1,52 @@ +## Share your logs with us to help improve Auto-GPT + +Do you notice weird behavior with your agent? Do you have an interesting use case? Do you have a bug you want to report? +Follow the steps below to enable your logs and upload them. You can include these logs when making an issue report or discussing an issue with us. + +### Enable Debug Logs +Activity, Error, and Debug logs are located in `./logs`. + +To print out debug logs: + +``` shell +./run.sh --debug # on Linux / macOS + +.\run.bat --debug # on Windows + +docker-compose run --rm auto-gpt --debug # in Docker +``` + +### Inspect and share logs +You can inspect and share logs via [e2b](https://e2b.dev). +![E2b logs dashboard](./imgs/e2b-dashboard.png) + + + +1. Go to [autogpt.e2b.dev](https://autogpt.e2b.dev) and sign in. +2. You'll see logs from other members of the AutoGPT team that you can inspect. +3. Or you can upload your own logs. Click on the "Upload log folder" button and select the debug logs directory that you generated. Wait 1-2 seconds and the page will reload. +4. You can share logs by sharing the URL in your browser. +![E2b log URL](./imgs/e2b-log-url.png) + + +### Add tags to logs +You can add custom tags to logs for other members of your team. This is useful if you want to indicate, for example, that the agent is having issues with challenges. + +E2b offers three severity levels: + +- Success +- Warning +- Error + +You can name your tag any way you want. + +#### How to add a tag +1. Click on the "plus" button to the left of the logs folder name. + +![E2b tag button](./imgs/e2b-tag-button.png) + +2. Type the name of a new tag. + +3. Select the severity. + +![E2b new tag](./imgs/e2b-new-tag.png) diff --git a/docs/usage.md b/docs/usage.md index a9ef2883e..cb74ef7f6 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -89,12 +89,20 @@ This may give your bot increased intelligence. ## Logs -Activity and error logs are located in the `./output/logs` +Activity, Error, and Debug logs are located in `./logs`. + +!!! tip + Do you notice weird behavior with your agent? Do you have an interesting use case? Do you have a bug you want to report? + Follow the steps below to enable your logs. You can include these logs when making an issue report or discussing an issue with us.
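If you want to package the log folder before sharing or attaching it, a small helper script can zip it up first. The sketch below is illustrative only and assumes the default `./logs` directory described above; `LOG_DIR` and `ARCHIVE_NAME` are assumed names, not part of the Auto-GPT codebase.

```python
# Minimal sketch: archive the debug logs so they can be attached to an issue
# or uploaded in one piece. Assumes the default ./logs directory; LOG_DIR and
# ARCHIVE_NAME are illustrative, not names used by Auto-GPT itself.
import shutil
from pathlib import Path

LOG_DIR = Path("logs")         # default log location when running with --debug
ARCHIVE_NAME = "autogpt-logs"  # hypothetical base name for the zip file

if LOG_DIR.is_dir():
    archive = shutil.make_archive(ARCHIVE_NAME, "zip", root_dir=LOG_DIR)
    print(f"Wrote {archive}; share this file or upload the raw folder to e2b")
else:
    print(f"No {LOG_DIR}/ directory found; run Auto-GPT with --debug first")
```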
To print out debug logs: ``` shell -./run.sh --debug +./run.sh --debug # on Linux / macOS + +.\run.bat --debug # on Windows + +docker-compose run --rm auto-gpt --debug # in Docker ``` ## Disabling Command Categories diff --git a/mkdocs.yml b/mkdocs.yml index 50e062571..a85004453 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -12,7 +12,8 @@ nav: - Voice: configuration/voice.md - Image Generation: configuration/imagegen.md - - Contributing: + - Help us improve Auto-GPT: + - Share your debug logs with us: share-your-logs.md - Contribution guide: contributing.md - Running tests: testing.md - Code of Conduct: code-of-conduct.md -- cgit v1.2.3 From 3a9dfa4c594dd7628de0a1b4bb9e3a15f1c1f172 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Thu, 13 Jul 2023 20:47:55 -0700 Subject: Update submodules and upload artifacts (#97) Signed-off-by: Merwane Hamadi --- .github/workflows/ci.yml | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 34eedb292..0e9263861 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -135,15 +135,15 @@ jobs: fi pip install ../../dist/*.whl - - if [ "${GITHUB_EVENT_NAME}" == "schedule" ] || [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ]; then - curl -s https://raw.githubusercontent.com/Helicone/helicone/main/mitmproxy.sh | bash -s start - agbenchmark start --maintain - else + + if [ "${GITHUB_EVENT_NAME}" == "pull_request" ]; then agbenchmark start --maintain --mock agbenchmark start --improve --mock agbenchmark start --mock agbenchmark start --mock --category=retrieval + else + curl -s https://raw.githubusercontent.com/Helicone/helicone/main/mitmproxy.sh | bash -s start + agbenchmark start --maintain fi env: GITHUB_EVENT_NAME: ${{ github.event_name }} @@ -155,9 +155,10 @@ jobs: HELICONE_CACHE_ENABLED: true HELICONE_PROPERTY_AGENT: ${{ matrix.agent-name }} - - name: Upload logs as artifact + + - name: Upload reports if: always() uses: actions/upload-artifact@v3 with: - name: gpt-engineer-projects - path: agent/gpt-engineer/projects + name: ${{ matrix.agent-name }} + path: agent/${{ matrix.agent-name }}/agbenchmark -- cgit v1.2.3 From 376ecf0c5f1180739b94b68b4ee5cdcd7dca2f09 Mon Sep 17 00:00:00 2001 From: GECORegulatory <121075828+GECORegulatory@users.noreply.github.com> Date: Fri, 14 Jul 2023 12:24:49 -0400 Subject: Replaced Fictitious color name Fore.ORANGE (#4972) Changed Colorama.Fore.ORANGE to YELLOW in config.py, As Colorama does not support an ORANGE color. This fixes a fatal error in run.sh when trying to set the API key through the input() method. 
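As a quick illustration of the underlying bug (an editor's sketch, not part of the patch): `colorama.Fore` defines YELLOW but no ORANGE, so the original attribute access raised `AttributeError` the moment that code path ran.

```python
# Why the original code failed: colorama's Fore palette has no ORANGE member,
# so Fore.ORANGE raises AttributeError as soon as the line executes.
from colorama import Fore

print(hasattr(Fore, "YELLOW"))  # True  -- a supported ANSI foreground color
print(hasattr(Fore, "ORANGE"))  # False -- not defined by colorama

try:
    _ = Fore.ORANGE  # the access that crashed run.sh during API-key entry
except AttributeError as err:
    print(f"AttributeError: {err}")
```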
Co-authored-by: James Collins --- autogpt/config/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autogpt/config/config.py b/autogpt/config/config.py index ae2f7bedc..cb3f26d3e 100644 --- a/autogpt/config/config.py +++ b/autogpt/config/config.py @@ -367,7 +367,7 @@ def check_openai_api_key(config: Config) -> None: print( Fore.GREEN + "OpenAI API key successfully set!\n" - + Fore.ORANGE + + Fore.YELLOW + "NOTE: The API key you've set is only temporary.\n" + "For longer sessions, please set it in .env file" + Fore.RESET -- cgit v1.2.3 From a9702e4629d4b1d90d118b4dabbbb665f5635e97 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Fri, 14 Jul 2023 10:27:48 -0700 Subject: Add basic code generation challenge (#98) --- .github/workflows/ci.yml | 4 +++ agbenchmark/challenge.py | 7 +++++ agbenchmark/challenges/README.md | 16 +++++++++++ .../challenges/code/d4/artifacts_out/__init__.py | 0 .../challenges/code/d4/artifacts_out/code.py | 12 +++++++++ agbenchmark/challenges/code/d4/data.json | 18 +++++++++++++ .../challenges/code/d4/hidden_files/test.py | 31 ++++++++++++++++++++++ agent/gpt-engineer | 2 +- pyproject.toml | 4 +-- 9 files changed, 91 insertions(+), 3 deletions(-) create mode 100644 agbenchmark/challenges/code/d4/artifacts_out/__init__.py create mode 100644 agbenchmark/challenges/code/d4/artifacts_out/code.py create mode 100644 agbenchmark/challenges/code/d4/data.json create mode 100644 agbenchmark/challenges/code/d4/hidden_files/test.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0e9263861..b7864db6a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -141,6 +141,10 @@ jobs: agbenchmark start --improve --mock agbenchmark start --mock agbenchmark start --mock --category=retrieval + agbenchmark start --mock --category=interface + agbenchmark start --mock --category=code + agbenchmark start --mock --category=memory + agbenchmark start --mock --category=iterate else curl -s https://raw.githubusercontent.com/Helicone/helicone/main/mitmproxy.sh | bash -s start agbenchmark start --maintain diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py index aeebd7ad8..874fd45bd 100644 --- a/agbenchmark/challenge.py +++ b/agbenchmark/challenge.py @@ -46,6 +46,13 @@ class Challenge(ABC): run_agent(self.task, config, self.CHALLENGE_LOCATION) + # hidden files are added after the agent runs. Hidden files can be python test files. + # We copy them in the workspace to make it easy to import the code produced by the agent + + copy_artifacts_into_workspace( + config["workspace"], "hidden_files", self.CHALLENGE_LOCATION + ) + def test_method(self, config: Dict[str, Any]) -> None: raise NotImplementedError diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md index 305cd28f1..a890c9d36 100644 --- a/agbenchmark/challenges/README.md +++ b/agbenchmark/challenges/README.md @@ -46,3 +46,19 @@ Example: Current Output: - **score** (float): scores range from [0, 1] + +## Add files to challenges: + +### artifacts_in + +This folder contains all the files you want the agent to have in its workspace BEFORE the challenge starts + +### artifacts_out +This folder contains all the files you would like the agent to generate. This folder is used to mock the agent. +This allows to run agbenchmark start --test=TestExample --mock and make sure our challenge actually works. + +### hidden_files +This folder contains files hidden from the agent but useful to assess whether a challenge is successful. 
+For example we can have a test.py in it, and this test.py will be added to the workspace at the end of a challenge. +This allows us to run this test.py and easily import code generated by the agent. +For example see: TestBasicCodeGeneration challenge. diff --git a/agbenchmark/challenges/code/d4/artifacts_out/__init__.py b/agbenchmark/challenges/code/d4/artifacts_out/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/challenges/code/d4/artifacts_out/code.py b/agbenchmark/challenges/code/d4/artifacts_out/code.py new file mode 100644 index 000000000..de3d8c62c --- /dev/null +++ b/agbenchmark/challenges/code/d4/artifacts_out/code.py @@ -0,0 +1,12 @@ +# mypy: ignore-errors +from typing import List, Optional + + +def two_sum(nums: List, target: int) -> Optional[List[int]]: + seen = {} + for i, num in enumerate(nums): + complement = target - num + if complement in seen: + return [seen[complement], i] + seen[num] = i + return None diff --git a/agbenchmark/challenges/code/d4/data.json b/agbenchmark/challenges/code/d4/data.json new file mode 100644 index 000000000..51f6f2702 --- /dev/null +++ b/agbenchmark/challenges/code/d4/data.json @@ -0,0 +1,18 @@ +{ + "name": "TestBasicCodeGeneration", + "category": ["code", "iterate"], + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "dependencies": ["TestWriteFile"], + "ground": { + "answer": "The two_sum function coded properly.", + "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], + "should_not_contain": [], + "files": ["test.py"], + "type": "execute_python_code" + }, + "info": { + "difficulty": "novice", + "description": "Tests ability for the agent to create the two_sum function.", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/code/d4/hidden_files/test.py b/agbenchmark/challenges/code/d4/hidden_files/test.py new file mode 100644 index 000000000..d85d13537 --- /dev/null +++ b/agbenchmark/challenges/code/d4/hidden_files/test.py @@ -0,0 +1,31 @@ +# mypy: ignore-errors +from code import two_sum +from typing import List + + +def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: + result = two_sum(nums, target) + print(result) + assert ( + result == expected_result + ), f"AssertionError: Expected the output to be {expected_result}" + + +if __name__ == "__main__": + # test the trivial case with the first two numbers + nums = [2, 7, 11, 15] + target = 9 + expected_result = [0, 1] + test_two_sum(nums, target, expected_result) + + # test for ability to use zero and the same number twice + nums = [2, 7, 0, 15, 12, 0] + target = 0 + expected_result = [2, 5] + test_two_sum(nums, target, expected_result) + + # test for first and last index usage and negative numbers + nums = [-6, 7, 11, 4] + target = -2 + expected_result = [0, 3] + test_two_sum(nums, target, expected_result) diff --git a/agent/gpt-engineer b/agent/gpt-engineer index 521d626c0..bca191cd7 160000 --- a/agent/gpt-engineer +++ b/agent/gpt-engineer @@ -1 +1 @@ -Subproject commit 521d626c0075ed6545f01b771757c856f8addbd6 +Subproject commit bca191cd76cdea0335da91d004c64d9bb8520fea diff --git a/pyproject.toml b/pyproject.toml index b0526ab57..48be9cf5d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ 
-38,10 +38,10 @@ testpaths = [ ] markers = [ "retrieval", - "regression", "interface", "code", - "memory" + "memory", + "iterate" ] [tool.poetry.scripts] -- cgit v1.2.3 From 7bc7d9213df32cabf8e96f422741c037b7817487 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Fri, 14 Jul 2023 14:39:47 -0700 Subject: Replace hidden files with custom python (#99) Signed-off-by: Merwane Hamadi --- agbenchmark/challenge.py | 2 +- agbenchmark/challenges/README.md | 9 +++---- .../challenges/code/d3/custom_python/api_tests.py | 14 ++++++++++ agbenchmark/challenges/code/d3/data.json | 6 ++--- .../challenges/code/d4/custom_python/test.py | 31 ++++++++++++++++++++++ .../challenges/code/d4/hidden_files/test.py | 31 ---------------------- agbenchmark/challenges/test_all.py | 20 +------------- 7 files changed, 54 insertions(+), 59 deletions(-) create mode 100644 agbenchmark/challenges/code/d4/custom_python/test.py delete mode 100644 agbenchmark/challenges/code/d4/hidden_files/test.py diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py index 874fd45bd..f07faf8ee 100644 --- a/agbenchmark/challenge.py +++ b/agbenchmark/challenge.py @@ -50,7 +50,7 @@ class Challenge(ABC): # We copy them in the workspace to make it easy to import the code produced by the agent copy_artifacts_into_workspace( - config["workspace"], "hidden_files", self.CHALLENGE_LOCATION + config["workspace"], "custom_python", self.CHALLENGE_LOCATION ) def test_method(self, config: Dict[str, Any]) -> None: diff --git a/agbenchmark/challenges/README.md b/agbenchmark/challenges/README.md index a890c9d36..34e35154e 100644 --- a/agbenchmark/challenges/README.md +++ b/agbenchmark/challenges/README.md @@ -57,8 +57,7 @@ This folder contains all the files you want the agent to have in its workspace B This folder contains all the files you would like the agent to generate. This folder is used to mock the agent. This allows to run agbenchmark start --test=TestExample --mock and make sure our challenge actually works. -### hidden_files -This folder contains files hidden from the agent but useful to assess whether a challenge is successful. -For example we can have a test.py in it, and this test.py will be added to the workspace at the end of a challenge. -This allows us to run this test.py and easily import code generated by the agent. -For example see: TestBasicCodeGeneration challenge. +### custom_python +This folder contains files that will be copied into the agent's workspace and run after the challenge is completed. +For example we can have a test.py in it and run this file in the workspace to easily import code generated by the agent. +Example: TestBasicCodeGeneration challenge. 
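To make the custom_python pattern concrete, here is a minimal sketch of such a check file. The `from code import two_sum` import mirrors the TestBasicCodeGeneration challenge described above; the `check` helper and its expected values are illustrative assumptions, not code from the repository.

```python
# Minimal sketch of a custom_python file. Because it is copied into the
# agent's workspace after the run, it can import modules the agent generated
# there. The module name `code` mirrors the TestBasicCodeGeneration challenge;
# the check() helper and its expected values are illustrative.
from code import two_sum  # agent-generated file in the same workspace


def check() -> None:
    result = two_sum([2, 7, 11, 15], 9)
    print(result)  # the harness inspects stdout for the expected answer
    assert result == [0, 1], f"AssertionError: Expected [0, 1], got {result}"


if __name__ == "__main__":
    check()
```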
diff --git a/agbenchmark/challenges/code/d3/custom_python/api_tests.py b/agbenchmark/challenges/code/d3/custom_python/api_tests.py index 1d6255ebd..f01934ef8 100644 --- a/agbenchmark/challenges/code/d3/custom_python/api_tests.py +++ b/agbenchmark/challenges/code/d3/custom_python/api_tests.py @@ -5,6 +5,15 @@ from unittest.mock import Mock, patch import requests +def test_make_request_and_assert() -> None: + result = make_request_and_assert() + print(result) + expected_result = {"status": "OK"} + error_message = f"AssertionError: Expected the output to be {expected_result}" + print(error_message) + assert result == expected_result, error_message + + def make_assertion() -> None: if os.environ.get("MOCK_TEST", "False").lower() == "true": mock_response = Mock(requests.Response) @@ -25,3 +34,8 @@ def make_request_and_assert() -> Dict[str, Any]: ) return response.json() + + +if __name__ == "__main__": + # test for the case when server is healthy + test_make_request_and_assert() diff --git a/agbenchmark/challenges/code/d3/data.json b/agbenchmark/challenges/code/d3/data.json index 94c81664c..ae0e45581 100644 --- a/agbenchmark/challenges/code/d3/data.json +++ b/agbenchmark/challenges/code/d3/data.json @@ -6,9 +6,9 @@ "ground": { "answer": "GET localhost:8079/health responds with a 200 OK", "should_contain": [], - "should_not_contain": [], - "files": [], - "type": "custom_python" + "should_not_contain": ["AssertionError"], + "files": ["test.py"], + "type": "execute_python_code" }, "info": { "difficulty": "advanced", diff --git a/agbenchmark/challenges/code/d4/custom_python/test.py b/agbenchmark/challenges/code/d4/custom_python/test.py new file mode 100644 index 000000000..d85d13537 --- /dev/null +++ b/agbenchmark/challenges/code/d4/custom_python/test.py @@ -0,0 +1,31 @@ +# mypy: ignore-errors +from code import two_sum +from typing import List + + +def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: + result = two_sum(nums, target) + print(result) + assert ( + result == expected_result + ), f"AssertionError: Expected the output to be {expected_result}" + + +if __name__ == "__main__": + # test the trivial case with the first two numbers + nums = [2, 7, 11, 15] + target = 9 + expected_result = [0, 1] + test_two_sum(nums, target, expected_result) + + # test for ability to use zero and the same number twice + nums = [2, 7, 0, 15, 12, 0] + target = 0 + expected_result = [2, 5] + test_two_sum(nums, target, expected_result) + + # test for first and last index usage and negative numbers + nums = [-6, 7, 11, 4] + target = -2 + expected_result = [0, 3] + test_two_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d4/hidden_files/test.py b/agbenchmark/challenges/code/d4/hidden_files/test.py deleted file mode 100644 index d85d13537..000000000 --- a/agbenchmark/challenges/code/d4/hidden_files/test.py +++ /dev/null @@ -1,31 +0,0 @@ -# mypy: ignore-errors -from code import two_sum -from typing import List - - -def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: - result = two_sum(nums, target) - print(result) - assert ( - result == expected_result - ), f"AssertionError: Expected the output to be {expected_result}" - - -if __name__ == "__main__": - # test the trivial case with the first two numbers - nums = [2, 7, 11, 15] - target = 9 - expected_result = [0, 1] - test_two_sum(nums, target, expected_result) - - # test for ability to use zero and the same number twice - nums = [2, 7, 0, 15, 12, 0] - target = 0 - expected_result = [2, 5] - 
test_two_sum(nums, target, expected_result) - - # test for first and last index usage and negative numbers - nums = [-6, 7, 11, 4] - target = -2 - expected_result = [0, 3] - test_two_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/test_all.py b/agbenchmark/challenges/test_all.py index a5afef96c..98a5ab81a 100644 --- a/agbenchmark/challenges/test_all.py +++ b/agbenchmark/challenges/test_all.py @@ -2,8 +2,6 @@ import glob import importlib import json import os -import pkgutil -import sys import types from pathlib import Path from typing import Any, Dict @@ -60,23 +58,7 @@ def generate_tests() -> None: scores = self.get_scores(config) - # Check if make_assertion is defined and use it - if self.data.ground.type == "custom_python": - custom_python_location = ( - f"{CURRENT_DIRECTORY}/../{challenge_location}/custom_python" - ) - sys.path.append(str(custom_python_location)) - - for module_loader, name, ispkg in pkgutil.iter_modules( - [str(custom_python_location)] - ): - module = importlib.import_module(name) - - if hasattr(module, "make_assertion"): - make_assertion = getattr(module, "make_assertion") - make_assertion() - else: - assert 1 in scores + assert 1 in scores # Parametrize the method here test_method = pytest.mark.parametrize( -- cgit v1.2.3 From 281cb0ef37c3b8934af787f6681858b0c472556b Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Fri, 14 Jul 2023 14:56:56 -0700 Subject: Start showing benchmark results (#100) --- README.md | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index ed348b5ab..e73f39891 100644 --- a/README.md +++ b/README.md @@ -2,13 +2,26 @@ A repo built for the purpose of benchmarking the performance of agents far and wide, regardless of how they are set up and how they work -### Scores: +## Scores: +Spider chart for each agent coming soon ! -Scoring of agents will go here. Both overall and by category. +## Detailed results +:warning: These results are constantly evolving at the moment. We will publish an official benchmark result very soon. -### Integrated Agents +### Auto-GPT +Coming Soon! -- Auto-GPT -- gpt-engineer -- mini-agi -- smol-developer +### gpt-engineer + +| Task | Results | +|-----------------------------------|----------------------| +| Debug Simple Typo With Guidance | :x: | +| Debug Simple Typo Without Guidance| :x: | +| Basic Code Generation | :white_check_mark: | +| Create Simple Web Server | :x: | + +### mini-agi +Coming Soon! + +### smol-developer +Coming Soon! -- cgit v1.2.3 From c821b294c619c604099b7a9497cc967d9e65ca29 Mon Sep 17 00:00:00 2001 From: ido777 Date: Sat, 15 Jul 2023 01:23:59 +0300 Subject: Fix orjson encoding text with UTF-8 surrogates (#3666) * added lib ftfy (fixes text for you), to solve surrogates errors --------- Co-authored-by: Reinier van der Leer --- autogpt/memory/vector/memory_item.py | 4 ++++ requirements.txt | 1 + 2 files changed, 5 insertions(+) diff --git a/autogpt/memory/vector/memory_item.py b/autogpt/memory/vector/memory_item.py index 587a915b4..f7a7fe6e8 100644 --- a/autogpt/memory/vector/memory_item.py +++ b/autogpt/memory/vector/memory_item.py @@ -4,6 +4,7 @@ import dataclasses import json from typing import Literal +import ftfy import numpy as np from autogpt.config import Config @@ -43,6 +44,9 @@ class MemoryItem: ): logger.debug(f"Memorizing text:\n{'-'*32}\n{text}\n{'-'*32}\n") + # Fix encoding, e.g. 
removing unicode surrogates (see issue #778) + text = ftfy.fix_text(text) + chunks = [ chunk for chunk, _ in ( diff --git a/requirements.txt b/requirements.txt index 47aa08a69..4af8bccd9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,6 +19,7 @@ google-api-python-client #(https://developers.google.com/custom-search/v1/overvi pinecone-client==2.2.1 redis orjson==3.8.10 +ftfy>=6.1.1 Pillow selenium==4.1.4 webdriver-manager -- cgit v1.2.3 From 7de965ab3fa77c724458512053993ce16c3d600f Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Fri, 14 Jul 2023 18:04:35 -0700 Subject: Show Auto-GPT results (#102) --- README.md | 36 +++++++++++++++++++++++++++++++++++- agent/Auto-GPT | 2 +- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e73f39891..fa06317c5 100644 --- a/README.md +++ b/README.md @@ -9,10 +9,44 @@ Spider chart for each agent coming soon ! :warning: These results are constantly evolving at the moment. We will publish an official benchmark result very soon. ### Auto-GPT -Coming Soon! +Interface + +| Task | Results | +|--------------|---------------------| +| Write File | :white_check_mark: | +| Read File | :white_check_mark: | +| Search File | :x: | + +Code + +| Task | Results | +|-----------------------------------|----------------------| +| Debug Simple Typo With Guidance | :x: | +| Debug Simple Typo Without Guidance| :x: | +| Basic Code Generation | :white_check_mark: | +| Create Simple Web Server | :x: | + +Memory + +| Task | Results | +|--------------------------------------------|--------------------| +| Basic Memory | :white_check_mark: | +| Remember Multiple Ids | :x: | +| Remember Multiple Ids With Noise | :x: | +| Remember Multiple Phrases With Noise | :x: | ### gpt-engineer +Interface + +| Task | Results | +|-------------|--------------------| +| Write File | :white_check_mark: | +| Read File | :white_check_mark: | +| Search File | :x: | + +Code + | Task | Results | |-----------------------------------|----------------------| | Debug Simple Typo With Guidance | :x: | diff --git a/agent/Auto-GPT b/agent/Auto-GPT index d4fc134f8..9079f6641 160000 --- a/agent/Auto-GPT +++ b/agent/Auto-GPT @@ -1 +1 @@ -Subproject commit d4fc134f8c4bd7b63f283f932f68932317f53f78 +Subproject commit 9079f66417f2480d0f5764fb0f916d3241b3fae8 -- cgit v1.2.3 From 5ae044f53db4af1b8a54ef8c7e2afb17e67568b9 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Sat, 15 Jul 2023 09:10:32 +0800 Subject: Integrate `plugin.handle_text_embedding` hook (#2804) * add feature custom text embedding in plugin * black code format * _get_embedding_with_plugin() * Fix docstring & type hint --------- Co-authored-by: Reinier van der Leer --- autogpt/memory/vector/utils.py | 22 ++++++++++++++++++++-- autogpt/models/base_open_ai_plugin.py | 12 +++++++----- tests/unit/models/test_base_open_api_plugin.py | 2 ++ 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/autogpt/memory/vector/utils.py b/autogpt/memory/vector/utils.py index eb6912566..1b050d562 100644 --- a/autogpt/memory/vector/utils.py +++ b/autogpt/memory/vector/utils.py @@ -1,3 +1,4 @@ +from contextlib import suppress from typing import Any, overload import numpy as np @@ -12,12 +13,12 @@ Embedding = list[np.float32] | np.ndarray[Any, np.dtype[np.float32]] @overload -def get_embedding(input: str | TText) -> Embedding: +def get_embedding(input: str | TText, config: Config) -> Embedding: ... 
@overload -def get_embedding(input: list[str] | list[TText]) -> list[Embedding]: +def get_embedding(input: list[str] | list[TText], config: Config) -> list[Embedding]: ... @@ -37,9 +38,16 @@ def get_embedding( if isinstance(input, str): input = input.replace("\n", " ") + + with suppress(NotImplementedError): + return _get_embedding_with_plugin(input, config) + elif multiple and isinstance(input[0], str): input = [text.replace("\n", " ") for text in input] + with suppress(NotImplementedError): + return [_get_embedding_with_plugin(i, config) for i in input] + model = config.embedding_model kwargs = {"model": model} kwargs.update(config.get_openai_credentials(model)) @@ -62,3 +70,13 @@ def get_embedding( embeddings = sorted(embeddings, key=lambda x: x["index"]) return [d["embedding"] for d in embeddings] + + +def _get_embedding_with_plugin(text: str, config: Config) -> Embedding: + for plugin in config.plugins: + if plugin.can_handle_text_embedding(text): + embedding = plugin.handle_text_embedding(text) + if embedding is not None: + return embedding + + raise NotImplementedError diff --git a/autogpt/models/base_open_ai_plugin.py b/autogpt/models/base_open_ai_plugin.py index c0aac8ed2..60f6f91bf 100644 --- a/autogpt/models/base_open_ai_plugin.py +++ b/autogpt/models/base_open_ai_plugin.py @@ -198,18 +198,20 @@ class BaseOpenAIPlugin(AutoGPTPluginTemplate): def can_handle_text_embedding(self, text: str) -> bool: """This method is called to check that the plugin can handle the text_embedding method. + Args: text (str): The text to be convert to embedding. - Returns: - bool: True if the plugin can handle the text_embedding method.""" + Returns: + bool: True if the plugin can handle the text_embedding method.""" return False - def handle_text_embedding(self, text: str) -> list: - """This method is called when the chat completion is done. + def handle_text_embedding(self, text: str) -> list[float]: + """This method is called to create a text embedding. + Args: text (str): The text to be convert to embedding. Returns: - list: The text embedding. + list[float]: The created embedding vector. """ def can_handle_user_input(self, user_input: str) -> bool: diff --git a/tests/unit/models/test_base_open_api_plugin.py b/tests/unit/models/test_base_open_api_plugin.py index 4d41eddd3..e656f4643 100644 --- a/tests/unit/models/test_base_open_api_plugin.py +++ b/tests/unit/models/test_base_open_api_plugin.py @@ -54,6 +54,7 @@ def test_dummy_plugin_default_methods(dummy_plugin): assert not dummy_plugin.can_handle_pre_command() assert not dummy_plugin.can_handle_post_command() assert not dummy_plugin.can_handle_chat_completion(None, None, None, None) + assert not dummy_plugin.can_handle_text_embedding(None) assert dummy_plugin.on_response("hello") == "hello" assert dummy_plugin.post_prompt(None) is None @@ -77,3 +78,4 @@ def test_dummy_plugin_default_methods(dummy_plugin): assert isinstance(post_command, str) assert post_command == "upgraded successfully!" 
assert dummy_plugin.handle_chat_completion(None, None, None, None) is None + assert dummy_plugin.handle_text_embedding(None) is None -- cgit v1.2.3 From 66fc7ccb31e88432abf1845a439210a36dd232cd Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Fri, 14 Jul 2023 18:26:17 -0700 Subject: Display smol-developer-results (#103) --- README.md | 21 ++++++++++++++++++--- agent/smol-developer | 2 +- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index fa06317c5..727fefa41 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ A repo built for the purpose of benchmarking the performance of agents far and wide, regardless of how they are set up and how they work ## Scores: -Spider chart for each agent coming soon ! +Radio chart for each agent coming soon ! ## Detailed results :warning: These results are constantly evolving at the moment. We will publish an official benchmark result very soon. @@ -42,7 +42,7 @@ Interface | Task | Results | |-------------|--------------------| | Write File | :white_check_mark: | -| Read File | :white_check_mark: | +| Read File | :x: | | Search File | :x: | Code @@ -58,4 +58,19 @@ Code Coming Soon! ### smol-developer -Coming Soon! +Interface + +| Task | Results | +|-------------|--------------------| +| Write File | :white_check_mark: | +| Read File | :x: | +| Search File | :x: | + +Code + +| Task | Results | +|-----------------------------------|----------------------| +| Debug Simple Typo With Guidance | :x: | +| Debug Simple Typo Without Guidance| :x: | +| Basic Code Generation | :white_check_mark: | +| Create Simple Web Server | :x: | diff --git a/agent/smol-developer b/agent/smol-developer index aa8233925..f4f439551 160000 --- a/agent/smol-developer +++ b/agent/smol-developer @@ -1 +1 @@ -Subproject commit aa8233925090c0c9314ceef68397ab37baf17766 +Subproject commit f4f4395511ed6ba59ec09100d6596bf81d68a898 -- cgit v1.2.3 From 8be2a0b2e13972ed042485f3eca551b794434881 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Fri, 14 Jul 2023 18:45:24 -0700 Subject: Display results per category (#104) --- README.md | 67 +++++++++++++-------------------------------------------------- 1 file changed, 14 insertions(+), 53 deletions(-) diff --git a/README.md b/README.md index 727fefa41..3011adedc 100644 --- a/README.md +++ b/README.md @@ -8,69 +8,30 @@ Radio chart for each agent coming soon ! ## Detailed results :warning: These results are constantly evolving at the moment. We will publish an official benchmark result very soon. 
-### Auto-GPT Interface -| Task | Results | -|--------------|---------------------| -| Write File | :white_check_mark: | -| Read File | :white_check_mark: | -| Search File | :x: | +| Task | Auto-GPT | gpt-engineer | mini-agi | smol-developer | +|--------------|--------------------|--------------------|----------|--------------------| +| Write File | :white_check_mark: | :white_check_mark: | tbd | :white_check_mark: | +| Read File | :white_check_mark: | :x: | tbd | :x: | +| Search File | :x: | :x: | tbd | :x: | + Code -| Task | Results | -|-----------------------------------|----------------------| -| Debug Simple Typo With Guidance | :x: | -| Debug Simple Typo Without Guidance| :x: | -| Basic Code Generation | :white_check_mark: | -| Create Simple Web Server | :x: | +| Task | Auto-GPT | gpt-engineer | mini-agi | smol-developer | +|------------------------------------|--------------------|--------------------|----------|--------------------| +| Debug Simple Typo With Guidance | :x: | :x: | tbd | :x: | +| Debug Simple Typo Without Guidance | :x: | :x: | tbd | :x: | +| Basic Code Generation | :white_check_mark: | :white_check_mark: | tbd | :white_check_mark: | +| Create Simple Web Server | :x: | :x: | tbd | :x: | + Memory -| Task | Results | +| Task | Auto-GPT | |--------------------------------------------|--------------------| | Basic Memory | :white_check_mark: | | Remember Multiple Ids | :x: | | Remember Multiple Ids With Noise | :x: | | Remember Multiple Phrases With Noise | :x: | - -### gpt-engineer - -Interface - -| Task | Results | -|-------------|--------------------| -| Write File | :white_check_mark: | -| Read File | :x: | -| Search File | :x: | - -Code - -| Task | Results | -|-----------------------------------|----------------------| -| Debug Simple Typo With Guidance | :x: | -| Debug Simple Typo Without Guidance| :x: | -| Basic Code Generation | :white_check_mark: | -| Create Simple Web Server | :x: | - -### mini-agi -Coming Soon! - -### smol-developer -Interface - -| Task | Results | -|-------------|--------------------| -| Write File | :white_check_mark: | -| Read File | :x: | -| Search File | :x: | - -Code - -| Task | Results | -|-----------------------------------|----------------------| -| Debug Simple Typo With Guidance | :x: | -| Debug Simple Typo Without Guidance| :x: | -| Basic Code Generation | :white_check_mark: | -| Create Simple Web Server | :x: | -- cgit v1.2.3 From bb654734167927b2d1e8673b6de13797dbad8dd6 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Sat, 15 Jul 2023 08:57:28 -0700 Subject: Update Auto-GPT to current version of master (#105) --- agent/Auto-GPT | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agent/Auto-GPT b/agent/Auto-GPT index 9079f6641..357a918ec 160000 --- a/agent/Auto-GPT +++ b/agent/Auto-GPT @@ -1 +1 @@ -Subproject commit 9079f66417f2480d0f5764fb0f916d3241b3fae8 +Subproject commit 357a918ecc9936207c70cf363bb95d74ec510e84 -- cgit v1.2.3 From dab4e90e157d65d5257880f1d818cd97a1b77030 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Sat, 15 Jul 2023 09:53:56 -0700 Subject: Update Auto-GPT score (#106) Signed-off-by: Merwane Hamadi --- README.md | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 3011adedc..368c79ee4 100644 --- a/README.md +++ b/README.md @@ -10,28 +10,28 @@ Radio chart for each agent coming soon ! 
Interface -| Task | Auto-GPT | gpt-engineer | mini-agi | smol-developer | -|--------------|--------------------|--------------------|----------|--------------------| -| Write File | :white_check_mark: | :white_check_mark: | tbd | :white_check_mark: | -| Read File | :white_check_mark: | :x: | tbd | :x: | -| Search File | :x: | :x: | tbd | :x: | +| Task | Auto-GPT | gpt-engineer | mini-agi | smol-developer | +|--------------|----------|--------------------|----------|--------------------| +| Write File | :x: | :white_check_mark: | tbd | :white_check_mark: | +| Read File | :x: | :x: | tbd | :x: | +| Search File | :x: | :x: | tbd | :x: | Code -| Task | Auto-GPT | gpt-engineer | mini-agi | smol-developer | -|------------------------------------|--------------------|--------------------|----------|--------------------| -| Debug Simple Typo With Guidance | :x: | :x: | tbd | :x: | -| Debug Simple Typo Without Guidance | :x: | :x: | tbd | :x: | -| Basic Code Generation | :white_check_mark: | :white_check_mark: | tbd | :white_check_mark: | -| Create Simple Web Server | :x: | :x: | tbd | :x: | +| Task | Auto-GPT | gpt-engineer | mini-agi | smol-developer | +|------------------------------------|----------|--------------------|----------|--------------------| +| Debug Simple Typo With Guidance | :x: | :x: | tbd | :x: | +| Debug Simple Typo Without Guidance | :x: | :x: | tbd | :x: | +| Basic Code Generation | :x: | :white_check_mark: | tbd | :white_check_mark: | +| Create Simple Web Server | :x: | :x: | tbd | :x: | Memory -| Task | Auto-GPT | -|--------------------------------------------|--------------------| -| Basic Memory | :white_check_mark: | -| Remember Multiple Ids | :x: | -| Remember Multiple Ids With Noise | :x: | -| Remember Multiple Phrases With Noise | :x: | +| Task | Auto-GPT | +|--------------------------------------------|----------| +| Basic Memory | :x: | +| Remember Multiple Ids | :x: | +| Remember Multiple Ids With Noise | :x: | +| Remember Multiple Phrases With Noise | :x: | -- cgit v1.2.3 From cbd2e49d973a344e9fce1e55e4ed4bf7e9c26e57 Mon Sep 17 00:00:00 2001 From: Erik Peterson Date: Sat, 15 Jul 2023 16:23:49 -0700 Subject: Clean up workspace between each test (#109) --- agbenchmark/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 32151b8ad..952588105 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -53,7 +53,7 @@ def config(request: Any) -> None: return config -@pytest.fixture(scope="module", autouse=True) +@pytest.fixture(autouse=True) def workspace(config: Dict[str, Any]) -> Generator[str, None, None]: output_path = config["workspace"] -- cgit v1.2.3 From 5886d7505914a53ad47f0e41087581e187178ae6 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Sat, 15 Jul 2023 16:52:42 -0700 Subject: Add three sum challenge (#108) Co-authored-by: Silen Naihin --- .github/workflows/ci.yml | 9 ++++--- agbenchmark/challenges/code/d1/data.json | 2 +- agbenchmark/challenges/code/d2/data.json | 2 +- agbenchmark/challenges/code/d4/data.json | 2 +- .../challenges/code/d5/artifacts_out/__init__.py | 0 .../challenges/code/d5/artifacts_out/code.py | 23 ++++++++++++++++ .../challenges/code/d5/custom_python/test.py | 31 ++++++++++++++++++++++ agbenchmark/challenges/code/d5/data.json | 18 +++++++++++++ agent/gpt-engineer | 2 +- 9 files changed, 82 insertions(+), 7 deletions(-) create mode 100644 agbenchmark/challenges/code/d5/artifacts_out/__init__.py create mode 100644 
agbenchmark/challenges/code/d5/artifacts_out/code.py create mode 100644 agbenchmark/challenges/code/d5/custom_python/test.py create mode 100644 agbenchmark/challenges/code/d5/data.json diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b7864db6a..dbb0a6ace 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -64,7 +64,7 @@ jobs: if: success() || failure() tests: - name: ${{ matrix.agent-name }} + name: "${{ matrix.agent-name }} (Cache: ${{ matrix.cache-enabled }})" runs-on: ubuntu-latest timeout-minutes: 10 env: @@ -77,6 +77,8 @@ jobs: - "smol-developer" - "Auto-GPT" - "mini-agi" + cache-enabled: [ true, false ] + steps: - name: Checkout repository uses: actions/checkout@v3 @@ -156,7 +158,7 @@ jobs: PROMPT_USER: false # For mini-agi. TODO: Remove this once mini-agi follows the standards. HELICONE_API_KEY: ${{ secrets.HELICONE_API_KEY }} REQUESTS_CA_BUNDLE: /etc/ssl/certs/ca-certificates.crt - HELICONE_CACHE_ENABLED: true + HELICONE_CACHE_ENABLED: ${{ matrix.cache-enabled }} HELICONE_PROPERTY_AGENT: ${{ matrix.agent-name }} @@ -164,5 +166,6 @@ jobs: if: always() uses: actions/upload-artifact@v3 with: - name: ${{ matrix.agent-name }} + name: + ${{ matrix.agent-name }} (Cache: ${{ matrix.cache-enabled }}) path: agent/${{ matrix.agent-name }}/agbenchmark diff --git a/agbenchmark/challenges/code/d1/data.json b/agbenchmark/challenges/code/d1/data.json index 061c924f5..bc1a15b42 100644 --- a/agbenchmark/challenges/code/d1/data.json +++ b/agbenchmark/challenges/code/d1/data.json @@ -1,6 +1,6 @@ { "name": "TestDebugSimpleTypoWithGuidance", - "category": ["code"], + "category": ["code", "iterate"], "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", "dependencies": ["TestReadFile", "TestWriteFile"], "ground": { diff --git a/agbenchmark/challenges/code/d2/data.json b/agbenchmark/challenges/code/d2/data.json index 6523ef1d8..fca86f29b 100644 --- a/agbenchmark/challenges/code/d2/data.json +++ b/agbenchmark/challenges/code/d2/data.json @@ -1,6 +1,6 @@ { "name": "TestDebugSimpleTypoWithoutGuidance", - "category": ["code"], + "category": ["code", "iterate"], "task": "Make test.py run without errors.", "dependencies": ["TestDebugSimpleTypoWithGuidance"], "ground": { diff --git a/agbenchmark/challenges/code/d4/data.json b/agbenchmark/challenges/code/d4/data.json index 51f6f2702..b2320a4e5 100644 --- a/agbenchmark/challenges/code/d4/data.json +++ b/agbenchmark/challenges/code/d4/data.json @@ -1,6 +1,6 @@ { "name": "TestBasicCodeGeneration", - "category": ["code", "iterate"], + "category": ["code"], "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", "dependencies": ["TestWriteFile"], "ground": { diff --git a/agbenchmark/challenges/code/d5/artifacts_out/__init__.py b/agbenchmark/challenges/code/d5/artifacts_out/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/challenges/code/d5/artifacts_out/code.py b/agbenchmark/challenges/code/d5/artifacts_out/code.py new file mode 100644 index 000000000..6056691da --- /dev/null +++ b/agbenchmark/challenges/code/d5/artifacts_out/code.py @@ -0,0 +1,23 @@ +# mypy: ignore-errors +from typing import List, Optional + + +def three_sum(nums: List[int], target: int) -> Optional[List[int]]: + nums_indices = [(num, index) for index, num in enumerate(nums)] + nums_indices.sort() + for i in range(len(nums_indices) - 2): + if i > 0 and nums_indices[i] == nums_indices[i - 1]: + continue + l, r = i + 1, len(nums_indices) - 1 + while l < r: + three_sum = nums_indices[i][0] + nums_indices[l][0] + nums_indices[r][0] + if three_sum < target: + l += 1 + elif three_sum > target: + r -= 1 + else: + indices = sorted( + [nums_indices[i][1], nums_indices[l][1], nums_indices[r][1]] + ) + return indices + return None diff --git a/agbenchmark/challenges/code/d5/custom_python/test.py b/agbenchmark/challenges/code/d5/custom_python/test.py new file mode 100644 index 000000000..761b9f5c6 --- /dev/null +++ b/agbenchmark/challenges/code/d5/custom_python/test.py @@ -0,0 +1,31 @@ +# mypy: ignore-errors +from code import three_sum +from typing import List + + +def test_three_sum(nums: List[int], target: int, expected_result: List[int]) -> None: + result = three_sum(nums, target) + print(result) + assert ( + result == expected_result + ), f"AssertionError: Expected the output to be {expected_result}" + + +if __name__ == "__main__": + # test the trivial case with the first three numbers + nums = [2, 7, 11, 15] + target = 20 + expected_result = [0, 1, 2] + test_three_sum(nums, target, expected_result) + + # test for ability to use zero and the same number twice + nums = [2, 7, 0, 15, 12, 0] + target = 2 + expected_result = [0, 2, 5] + test_three_sum(nums, target, expected_result) + + # test for first and last index usage and negative numbers + nums = [-6, 7, 11, 4] + target = 9 + expected_result = [0, 2, 3] + test_three_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d5/data.json b/agbenchmark/challenges/code/d5/data.json new file mode 100644 index 000000000..4b44c6943 --- /dev/null +++ b/agbenchmark/challenges/code/d5/data.json @@ -0,0 +1,18 @@ +{ + "name": "TestThreeSum", + "category": ["code", "iterate"], + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "dependencies": ["TestWriteFile", "TestBasicCodeGeneration"], + "ground": { + "answer": "The three_sum function coded properly.", + "should_contain": ["[0, 1, 2]", "[0, 2, 5]", "[0, 2, 3]"], + "should_not_contain": [], + "files": ["test.py"], + "type": "execute_python_code" + }, + "info": { + "difficulty": "intermediate", + "description": "Tests ability for the agent to create the three_sum function.", + "side_effects": [] + } +} diff --git a/agent/gpt-engineer b/agent/gpt-engineer index bca191cd7..f0c76918d 160000 --- a/agent/gpt-engineer +++ b/agent/gpt-engineer @@ -1 +1 @@ -Subproject commit bca191cd76cdea0335da91d004c64d9bb8520fea +Subproject commit f0c76918dff7a6cf5e0611a09b060fc5d4913b82 -- cgit v1.2.3 From 02dce4193780ba6d4c0225b3c21da16ecca51ab4 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Sat, 15 Jul 2023 18:00:37 -0700 Subject: Fix ci (#110) --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index dbb0a6ace..7f6959807 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -149,7 +149,7 @@ jobs: agbenchmark start --mock --category=iterate else curl -s https://raw.githubusercontent.com/Helicone/helicone/main/mitmproxy.sh | bash -s start - agbenchmark start --maintain + agbenchmark start | echo "This command will always return a non zero exit code unless all the challenges are solved." fi env: GITHUB_EVENT_NAME: ${{ github.event_name }} @@ -167,5 +167,5 @@ jobs: uses: actions/upload-artifact@v3 with: name: - ${{ matrix.agent-name }} (Cache: ${{ matrix.cache-enabled }}) + "${{ matrix.agent-name }} (Cache ${{ matrix.cache-enabled }})" path: agent/${{ matrix.agent-name }}/agbenchmark -- cgit v1.2.3 From 757baba3ff61f354359720667e136e40a54ae7f0 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Sat, 15 Jul 2023 18:09:29 -0700 Subject: Remove cache true on pr (#111) Signed-off-by: Merwane Hamadi --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7f6959807..907c21267 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -64,6 +64,7 @@ jobs: if: success() || failure() tests: + if: github.event_name != 'pull_request' || matrix.cache-enabled == false name: "${{ matrix.agent-name }} (Cache: ${{ matrix.cache-enabled }})" runs-on: ubuntu-latest timeout-minutes: 10 -- cgit v1.2.3 From 9f3a2d4f05702bf44b0b938582c5dd6f9a459ea2 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sat, 15 Jul 2023 22:10:20 -0400 Subject: Dynamic cutoff and other quality of life (#101) --- .gitmodules | 2 +- agbenchmark/agent_interface.py | 12 +- agbenchmark/challenge.py | 11 +- agbenchmark/challenges/code/d1/data.json | 1 + agbenchmark/challenges/code/d2/data.json | 1 + agbenchmark/challenges/code/d3/data.json | 1 + agbenchmark/challenges/code/d4/data.json | 1 + agbenchmark/challenges/code/d5/data.json | 1 + agbenchmark/challenges/define_task_types.py | 1 + .../challenges/interface/read_file/data.json | 1 + agbenchmark/challenges/interface/search/data.json | 1 + .../challenges/interface/write_file/data.json | 1 + agbenchmark/challenges/memory/m1/data.json | 1 + agbenchmark/challenges/memory/m2/data.json | 1 + agbenchmark/challenges/memory/m3/data.json | 1 + agbenchmark/challenges/memory/m4/data.json | 1 + agbenchmark/challenges/retrieval/r1/data.json | 1 + 
agbenchmark/challenges/retrieval/r2/data.json | 1 + agbenchmark/challenges/retrieval/r3/data.json | 1 + agbenchmark/challenges/test_all.py | 3 +- agbenchmark/config.json | 3 +- agbenchmark/conftest.py | 33 ++--- agbenchmark/internal_info.json | 8 +- agbenchmark/regression_tests.json | 19 +-- agbenchmark/reports/1.json | 148 --------------------- agbenchmark/reports/file1_07-14-18-54.json | 147 ++++++++++++++++++++ agbenchmark/start_benchmark.py | 2 +- agbenchmark/utils.py | 15 ++- agent/Auto-GPT | 2 +- agent/SuperAGI | 2 +- agent/config_example.json | 3 +- agent/gpt-engineer | 2 +- agent/mini-agi | 2 +- agent/smol-developer | 2 +- 34 files changed, 221 insertions(+), 211 deletions(-) delete mode 100644 agbenchmark/reports/1.json create mode 100644 agbenchmark/reports/file1_07-14-18-54.json diff --git a/.gitmodules b/.gitmodules index f14b5e07d..d2b71f9c4 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ [submodule "agent/Auto-GPT"] path = agent/Auto-GPT - url = https://github.com/Significant-Gravitas/Auto-GPT.git + url = https://github.com/merwanehamadi/Auto-GPT.git branch = benchmark-integration [submodule "agent/gpt-engineer"] path = agent/gpt-engineer diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index 991a7e8e0..897f4f8cf 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -16,9 +16,7 @@ MOCK_FLAG = mock_test_str.lower() == "true" if mock_test_str else False def run_agent( - task: str, - config: Dict[str, Any], - challenge_location: str, + task: str, config: Dict[str, Any], challenge_location: str, cutoff: int ) -> None: """Calling to get a response""" @@ -27,9 +25,7 @@ def run_agent( config["workspace"], "artifacts_out", challenge_location ) else: - print( - f"Running Python function '{config['entry_path']}' with timeout {config['cutoff']}" - ) + print(f"Running Python function '{config['entry_path']}' with timeout {cutoff}") command = [sys.executable, "-m", config["entry_path"], str(task)] process = subprocess.Popen( command, @@ -50,11 +46,11 @@ def run_agent( if ( process.poll() is not None or output == "" - or (time.time() - start_time > config["cutoff"]) + or (time.time() - start_time > cutoff) ): break - if time.time() - start_time > config["cutoff"]: + if time.time() - start_time > cutoff: print("The Python function has exceeded the time limit and was terminated.") process.kill() else: diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py index f07faf8ee..4f24bb603 100644 --- a/agbenchmark/challenge.py +++ b/agbenchmark/challenge.py @@ -4,16 +4,9 @@ import subprocess from abc import ABC from typing import Any, Dict, List -from dotenv import load_dotenv - from agbenchmark.challenges.define_task_types import ChallengeData, Ground from agbenchmark.start_benchmark import CURRENT_DIRECTORY -load_dotenv() - -mock_test_str = os.getenv("MOCK_TEST") -MOCK_TEST = mock_test_str.lower() == "true" if mock_test_str else False - class Challenge(ABC): """The parent class to all specific challenges classes. 
@@ -37,14 +30,14 @@ class Challenge(ABC): def dependencies(self) -> list: return self.data.dependencies - def setup_challenge(self, config: Dict[str, Any]) -> None: + def setup_challenge(self, config: Dict[str, Any], cutoff: int) -> None: from agbenchmark.agent_interface import copy_artifacts_into_workspace, run_agent copy_artifacts_into_workspace( config["workspace"], "artifacts_in", self.CHALLENGE_LOCATION ) - run_agent(self.task, config, self.CHALLENGE_LOCATION) + run_agent(self.task, config, self.CHALLENGE_LOCATION, cutoff) # hidden files are added after the agent runs. Hidden files can be python test files. # We copy them in the workspace to make it easy to import the code produced by the agent diff --git a/agbenchmark/challenges/code/d1/data.json b/agbenchmark/challenges/code/d1/data.json index bc1a15b42..d8e0280a4 100644 --- a/agbenchmark/challenges/code/d1/data.json +++ b/agbenchmark/challenges/code/d1/data.json @@ -3,6 +3,7 @@ "category": ["code", "iterate"], "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", "dependencies": ["TestReadFile", "TestWriteFile"], + "cutoff": 60, "ground": { "answer": "[0, 1] [2, 5] [0, 3]", "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], diff --git a/agbenchmark/challenges/code/d2/data.json b/agbenchmark/challenges/code/d2/data.json index fca86f29b..de32ef9a7 100644 --- a/agbenchmark/challenges/code/d2/data.json +++ b/agbenchmark/challenges/code/d2/data.json @@ -3,6 +3,7 @@ "category": ["code", "iterate"], "task": "Make test.py run without errors.", "dependencies": ["TestDebugSimpleTypoWithGuidance"], + "cutoff": 60, "ground": { "answer": "[0, 1] [2, 5] [0, 3]", "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], diff --git a/agbenchmark/challenges/code/d3/data.json b/agbenchmark/challenges/code/d3/data.json index ae0e45581..c5d111a4d 100644 --- a/agbenchmark/challenges/code/d3/data.json +++ b/agbenchmark/challenges/code/d3/data.json @@ -3,6 +3,7 @@ "category": ["code"], "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", "dependencies": ["TestDebugSimpleTypoWithGuidance"], + "cutoff": 60, "ground": { "answer": "GET localhost:8079/health responds with a 200 OK", "should_contain": [], diff --git a/agbenchmark/challenges/code/d4/data.json b/agbenchmark/challenges/code/d4/data.json index b2320a4e5..e8db918d2 100644 --- a/agbenchmark/challenges/code/d4/data.json +++ b/agbenchmark/challenges/code/d4/data.json @@ -3,6 +3,7 @@ "category": ["code"], "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", "dependencies": ["TestWriteFile"], + "cutoff": 60, "ground": { "answer": "The two_sum function coded properly.", "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], diff --git a/agbenchmark/challenges/code/d5/data.json b/agbenchmark/challenges/code/d5/data.json index 4b44c6943..434b1312e 100644 --- a/agbenchmark/challenges/code/d5/data.json +++ b/agbenchmark/challenges/code/d5/data.json @@ -3,6 +3,7 @@ "category": ["code", "iterate"], "task": "Create a three_sum function in a file called code.py. 
Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", "dependencies": ["TestWriteFile", "TestBasicCodeGeneration"], + "cutoff": 60, "ground": { "answer": "The three_sum function coded properly.", "should_contain": ["[0, 1, 2]", "[0, 2, 5]", "[0, 2, 3]"], diff --git a/agbenchmark/challenges/define_task_types.py b/agbenchmark/challenges/define_task_types.py index 668025dd2..dc1777d71 100644 --- a/agbenchmark/challenges/define_task_types.py +++ b/agbenchmark/challenges/define_task_types.py @@ -61,6 +61,7 @@ class ChallengeData(BaseModel): category: List[str] task: str dependencies: List[str] + cutoff: int ground: Ground info: Info diff --git a/agbenchmark/challenges/interface/read_file/data.json b/agbenchmark/challenges/interface/read_file/data.json index 1bf340454..55878856b 100644 --- a/agbenchmark/challenges/interface/read_file/data.json +++ b/agbenchmark/challenges/interface/read_file/data.json @@ -3,6 +3,7 @@ "category": ["interface"], "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", "dependencies": ["TestWriteFile"], + "cutoff": 60, "ground": { "answer": "random string Hello World!", "should_contain": ["random string", "Hello World!"], diff --git a/agbenchmark/challenges/interface/search/data.json b/agbenchmark/challenges/interface/search/data.json index de8934d95..793e71715 100644 --- a/agbenchmark/challenges/interface/search/data.json +++ b/agbenchmark/challenges/interface/search/data.json @@ -3,6 +3,7 @@ "category": ["interface"], "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", "dependencies": ["TestWriteFile"], + "cutoff": 60, "ground": { "answer": "This is a Heading\nThis is a paragraph.", "should_contain": ["Heading", "paragraph"], diff --git a/agbenchmark/challenges/interface/write_file/data.json b/agbenchmark/challenges/interface/write_file/data.json index 8db9cd620..426e6c3d4 100644 --- a/agbenchmark/challenges/interface/write_file/data.json +++ b/agbenchmark/challenges/interface/write_file/data.json @@ -3,6 +3,7 @@ "category": ["interface"], "task": "Print the the capital of America to a .txt file", "dependencies": [], + "cutoff": 60, "ground": { "answer": "Washington", "should_contain": ["Washington"], diff --git a/agbenchmark/challenges/memory/m1/data.json b/agbenchmark/challenges/memory/m1/data.json index ab86f1c3c..33c523244 100644 --- a/agbenchmark/challenges/memory/m1/data.json +++ b/agbenchmark/challenges/memory/m1/data.json @@ -3,6 +3,7 @@ "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", "dependencies": ["TestReadFile", "TestWriteFile"], + "cutoff": 60, "ground": { "answer": "2314", "should_contain": ["2314"], diff --git a/agbenchmark/challenges/memory/m2/data.json b/agbenchmark/challenges/memory/m2/data.json index 9205c99f1..840e8dc83 100644 --- a/agbenchmark/challenges/memory/m2/data.json +++ b/agbenchmark/challenges/memory/m2/data.json @@ -3,6 +3,7 @@ "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", "dependencies": ["TestBasicMemory"], + "cutoff": 60, "ground": { "answer": "3145\n3791\n9317\n9471", "should_contain": ["3145", "3791", "9317", "9471"], diff --git 
a/agbenchmark/challenges/memory/m3/data.json b/agbenchmark/challenges/memory/m3/data.json index 3b78d614b..3af2fb3a5 100644 --- a/agbenchmark/challenges/memory/m3/data.json +++ b/agbenchmark/challenges/memory/m3/data.json @@ -3,6 +3,7 @@ "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", "dependencies": ["TestRememberMultipleIds"], + "cutoff": 60, "ground": { "answer": "3145\n3791\n9317\n9471", "should_contain": ["3145", "3791", "9317", "9471"], diff --git a/agbenchmark/challenges/memory/m4/data.json b/agbenchmark/challenges/memory/m4/data.json index 84f5c2b21..17a078e1b 100644 --- a/agbenchmark/challenges/memory/m4/data.json +++ b/agbenchmark/challenges/memory/m4/data.json @@ -3,6 +3,7 @@ "category": ["memory"], "task": "Follow the instructions in the instructions_1.txt file", "dependencies": ["TestRememberMultipleIdsWithNoise"], + "cutoff": 60, "ground": { "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyoncé on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", "should_contain": [ diff --git a/agbenchmark/challenges/retrieval/r1/data.json b/agbenchmark/challenges/retrieval/r1/data.json index e3e09302d..c3af4862d 100644 --- a/agbenchmark/challenges/retrieval/r1/data.json +++ b/agbenchmark/challenges/retrieval/r1/data.json @@ -3,6 +3,7 @@ "category": ["retrieval"], "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", "dependencies": ["TestWriteFile", "TestSearch"], + "cutoff": 60, "ground": { "answer": "£25.89", "should_contain": ["25.89"], diff --git a/agbenchmark/challenges/retrieval/r2/data.json b/agbenchmark/challenges/retrieval/r2/data.json index 977be4bcd..f558b8584 100644 --- a/agbenchmark/challenges/retrieval/r2/data.json +++ b/agbenchmark/challenges/retrieval/r2/data.json @@ -3,6 +3,7 @@ "category": ["retrieval"], "task": "Write tesla's revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", "dependencies": ["TestBasicRetrieval"], + "cutoff": 60, "ground": { "answer": "81,462", "should_contain": ["81,462"], diff --git a/agbenchmark/challenges/retrieval/r3/data.json b/agbenchmark/challenges/retrieval/r3/data.json index 5504908ea..eb998ffbf 100644 --- a/agbenchmark/challenges/retrieval/r3/data.json +++ b/agbenchmark/challenges/retrieval/r3/data.json @@ -3,6 +3,7 @@ "category": ["retrieval"], "task": "Write tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", "dependencies": ["TestRetrieval2"], + "cutoff": 60, "ground": { "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", "should_contain": [ diff --git a/agbenchmark/challenges/test_all.py b/agbenchmark/challenges/test_all.py index 98a5ab81a..255b39e57 100644 --- a/agbenchmark/challenges/test_all.py +++ b/agbenchmark/challenges/test_all.py @@ -54,7 +54,8 @@ def generate_tests() -> None: # Define test method within the dynamically created class def test_method(self, config: Dict[str, Any]) -> None: # type: ignore - self.setup_challenge(config) + cutoff = self.data.cutoff or 60 + self.setup_challenge(config, cutoff) scores = self.get_scores(config) diff --git a/agbenchmark/config.json b/agbenchmark/config.json index af83029ef..820f133b1 100644 --- a/agbenchmark/config.json +++ b/agbenchmark/config.json @@ -1,5 +1,4 @@ { "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "agbenchmark.benchmarks", - "cutoff": 60 + "entry_path": "agbenchmark.benchmarks" } diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 952588105..245df485e 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -18,12 +18,10 @@ from agbenchmark.start_benchmark import ( from agbenchmark.utils import calculate_success_percentage -def resolve_workspace(config: Dict[str, Any]) -> str: - if config.get("workspace", "").startswith("${") and config.get( - "workspace", "" - ).endswith("}"): +def resolve_workspace(workspace: str) -> str: + if workspace.startswith("${") and workspace.endswith("}"): # Extract the string inside ${...} - path_expr = config["workspace"][2:-1] + path_expr = workspace[2:-1] # Check if it starts with "os.path.join" if path_expr.strip().startswith("os.path.join"): @@ -35,7 +33,7 @@ def resolve_workspace(config: Dict[str, Any]) -> str: else: raise ValueError("Invalid workspace path expression.") else: - return os.path.abspath(Path(os.getcwd()) / config["workspace"]) + return os.path.abspath(Path(os.getcwd()) / workspace) @pytest.fixture(scope="module") @@ -45,10 +43,10 @@ def config(request: Any) -> None: config = json.load(f) if isinstance(config["workspace"], str): - config["workspace"] = resolve_workspace(config) + config["workspace"] = resolve_workspace(config["workspace"]) else: # it's a input output dict - config["workspace"]["input"] = resolve_workspace(config) - config["workspace"]["output"] = resolve_workspace(config) + config["workspace"]["input"] = resolve_workspace(config["workspace"]["input"]) + config["workspace"]["output"] = resolve_workspace(config["workspace"]["output"]) return config @@ -173,18 +171,21 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None: regression_manager.remove_test(test_name) info_details["metrics"]["fail_reason"] = str(call.excinfo.value) - prev_test_results: list[bool] = [] - + prev_test_results: list[bool] = internal_info.tests.get(test_name, []) if not mock: # only add if it's an actual test - prev_test_results = internal_info.tests.get(test_name, []) prev_test_results.append(info_details["metrics"]["success"]) internal_info.add_test(test_name, prev_test_results) - # can calculate success rate regardless of mock - info_details["metrics"]["success_%"] = calculate_success_percentage( - prev_test_results - ) + # 
can calculate success rate regardless of mock + info_details["metrics"]["success_%"] = calculate_success_percentage( + prev_test_results + ) + else: + # can calculate success rate regardless of mock + info_details["metrics"][ + "non_mock_success_%" + ] = calculate_success_percentage(prev_test_results) if len(prev_test_results) >= 3 and prev_test_results[-3:] == [True, True, True]: # if the last 3 tests were successful, add to the regression tests diff --git a/agbenchmark/internal_info.json b/agbenchmark/internal_info.json index 5f46bd854..95a051d54 100644 --- a/agbenchmark/internal_info.json +++ b/agbenchmark/internal_info.json @@ -62,6 +62,12 @@ "TestWriteFile": [ true, true, - true + true, + false, + false, + false, + false, + true, + false ] } \ No newline at end of file diff --git a/agbenchmark/regression_tests.json b/agbenchmark/regression_tests.json index ce73ce263..25591a4de 100644 --- a/agbenchmark/regression_tests.json +++ b/agbenchmark/regression_tests.json @@ -16,57 +16,52 @@ "data_path": "agbenchmark/challenges/retrieval/r1" }, "TestReadFile": { - "difficulty": "basic", + "difficulty": "interface", "dependencies": [ "TestWriteFile" ], "data_path": "agbenchmark/challenges/interface/read_file" }, "TestRememberMultipleIds": { - "difficulty": "basic", + "difficulty": "novice", "dependencies": [ "TestBasicMemory" ], "data_path": "agbenchmark/challenges/memory/m2" }, "TestRememberMultipleIdsWithNoise": { - "difficulty": "medium", + "difficulty": "intermediate", "dependencies": [ "TestRememberMultipleIds" ], "data_path": "agbenchmark/challenges/memory/m3" }, "TestRememberMultiplePhrasesWithNoise": { - "difficulty": "medium", + "difficulty": "advanced", "dependencies": [ "TestRememberMultipleIdsWithNoise" ], "data_path": "agbenchmark/challenges/memory/m4" }, "TestRetrieval2": { - "difficulty": "basic", + "difficulty": "novice", "dependencies": [ "TestBasicRetrieval" ], "data_path": "agbenchmark/challenges/retrieval/r2" }, "TestRetrieval3": { - "difficulty": "basic", + "difficulty": "intermediate", "dependencies": [ "TestRetrieval2" ], "data_path": "agbenchmark/challenges/retrieval/r3" }, "TestSearch": { - "difficulty": "basic", + "difficulty": "interface", "dependencies": [ "TestWriteFile" ], "data_path": "agbenchmark/challenges/interface/search" - }, - "TestWriteFile": { - "difficulty": "basic", - "dependencies": [], - "data_path": "agbenchmark/challenges/interface/write_file" } } \ No newline at end of file diff --git a/agbenchmark/reports/1.json b/agbenchmark/reports/1.json deleted file mode 100644 index 45945a3ee..000000000 --- a/agbenchmark/reports/1.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "command": "agbenchmark start --mock", - "completion_time": "2023-07-11-21:09", - "metrics": { - "run_time": "0.96 seconds", - "highest_difficulty": "advanced: 5" - }, - "tests": { - "TestWriteFile": { - "data_path": "agbenchmark/challenges/interface/write_file", - "is_regression": false, - "metrics": { - "difficulty": "interface", - "success": true, - "success_%": 0, - "run_time": "0.008 seconds" - } - }, - "TestReadFile": { - "data_path": "agbenchmark/challenges/interface/read_file", - "is_regression": false, - "metrics": { - "difficulty": "interface", - "success": true, - "success_%": 0, - "run_time": "0.005 seconds" - } - }, - "TestSearch": { - "data_path": "agbenchmark/challenges/interface/search", - "is_regression": false, - "metrics": { - "difficulty": "interface", - "success": true, - "success_%": 0, - "run_time": "0.006 seconds" - } - }, - "TestDebugSimpleTypoWithGuidance": { - 
"data_path": "agbenchmark/challenges/code/d1", - "is_regression": false, - "metrics": { - "difficulty": "basic", - "success": false, - "fail_reason": "assert 1 in [0.0]", - "success_%": 0, - "run_time": "0.489 seconds" - } - }, - "TestBasicMemory": { - "data_path": "agbenchmark/challenges/memory/m1", - "is_regression": false, - "metrics": { - "difficulty": "basic", - "success": true, - "success_%": 0, - "run_time": "0.02 seconds" - } - }, - "TestBasicRetrieval": { - "data_path": "agbenchmark/challenges/retrieval/r1", - "is_regression": false, - "metrics": { - "difficulty": "basic", - "success": true, - "success_%": 0, - "run_time": "0.01 seconds" - } - }, - "TestDebugSimpleTypoWithoutGuidance": { - "data_path": "agbenchmark/challenges/code/d2", - "is_regression": false, - "metrics": { - "difficulty": "novice", - "success": false, - "fail_reason": "agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", - "success_%": 0, - "run_time": "0.001 seconds" - } - }, - "TestCreateSimpleWebServer": { - "data_path": "agbenchmark/challenges/code/d3", - "is_regression": false, - "metrics": { - "difficulty": "advanced", - "success": false, - "fail_reason": "agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", - "success_%": 0, - "run_time": "0.001 seconds" - } - }, - "TestRememberMultipleIds": { - "data_path": "agbenchmark/challenges/memory/m2", - "is_regression": false, - "metrics": { - "difficulty": "novice", - "success": true, - "success_%": 0, - "run_time": "0.018 seconds" - } - }, - "TestRetrieval2": { - "data_path": "agbenchmark/challenges/retrieval/r2", - "is_regression": false, - "metrics": { - "difficulty": "novice", - "success": true, - "success_%": 0, - "run_time": "0.009 seconds" - } - }, - "TestRememberMultipleIdsWithNoise": { - "data_path": "agbenchmark/challenges/memory/m3", - "is_regression": false, - "metrics": { - "difficulty": "intermediate", - "success": true, - "success_%": 0, - "run_time": "0.022 seconds" - } - }, - "TestRetrieval3": { - "data_path": "agbenchmark/challenges/retrieval/r3", - "is_regression": false, - "metrics": { - "difficulty": "intermediate", - "success": true, - "success_%": 0, - "run_time": "0.01 seconds" - } - }, - "TestRememberMultiplePhrasesWithNoise": { - "data_path": "agbenchmark/challenges/memory/m4", - "is_regression": false, - "metrics": { - "difficulty": "advanced", - "success": true, - "success_%": 0, - "run_time": "0.021 seconds" - } - } - }, - "config": { - "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "agbenchmark.benchmarks", - "cutoff": 60 - } -} \ No newline at end of file diff --git a/agbenchmark/reports/file1_07-14-18-54.json b/agbenchmark/reports/file1_07-14-18-54.json new file mode 100644 index 000000000..f81d19d3d --- /dev/null +++ b/agbenchmark/reports/file1_07-14-18-54.json @@ -0,0 +1,147 @@ +{ + "command": "agbenchmark start --mock", + "completion_time": "2023-07-14-18:54", + "metrics": { + "run_time": "0.97 seconds", + "highest_difficulty": "advanced: 5" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": true, + "non_mock_success_%": 75.0, + "run_time": "0.007 seconds" + } + }, + 
"TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": true, + "metrics": { + "difficulty": "interface", + "success": true, + "non_mock_success_%": 100.0, + "run_time": "0.008 seconds" + } + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": true, + "metrics": { + "difficulty": "interface", + "success": true, + "non_mock_success_%": 100.0, + "run_time": "0.007 seconds" + } + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "non_mock_success_%": 0.0, + "run_time": "0.448 seconds" + } + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1", + "is_regression": true, + "metrics": { + "difficulty": "basic", + "success": true, + "non_mock_success_%": 100.0, + "run_time": "0.028 seconds" + } + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1", + "is_regression": true, + "metrics": { + "difficulty": "basic", + "success": true, + "non_mock_success_%": 100.0, + "run_time": "0.014 seconds" + } + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "non_mock_success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d3", + "is_regression": false, + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "non_mock_success_%": 0.0, + "run_time": "0.002 seconds" + } + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2", + "is_regression": true, + "metrics": { + "difficulty": "novice", + "success": true, + "non_mock_success_%": 100.0, + "run_time": "0.023 seconds" + } + }, + "TestRetrieval2": { + "data_path": "agbenchmark/challenges/retrieval/r2", + "is_regression": true, + "metrics": { + "difficulty": "novice", + "success": true, + "non_mock_success_%": 100.0, + "run_time": "0.013 seconds" + } + }, + "TestRememberMultipleIdsWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3", + "is_regression": true, + "metrics": { + "difficulty": "intermediate", + "success": true, + "non_mock_success_%": 100.0, + "run_time": "0.03 seconds" + } + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": true, + "metrics": { + "difficulty": "intermediate", + "success": true, + "non_mock_success_%": 100.0, + "run_time": "0.016 seconds" + } + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4", + "is_regression": true, + "metrics": { + "difficulty": "advanced", + "success": true, + "non_mock_success_%": 100.0, + "run_time": "0.034 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}", + "entry_path": "agbenchmark.benchmarks" + } +} \ No newline at end of file diff --git a/agbenchmark/start_benchmark.py 
b/agbenchmark/start_benchmark.py index ab2586e60..b31c9f5f9 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -62,7 +62,7 @@ def start(category: str, test: str, maintain: bool, improve: bool, mock: bool) - config["entry_path"] = click.prompt( "Please enter the path to your run_specific_agent function implementation within the benchmarks folder", - default="benchmarks.py", + default="agbenchmark/benchmarks.py", ) config["cutoff"] = click.prompt( diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py index 598113d3d..1174e89bb 100644 --- a/agbenchmark/utils.py +++ b/agbenchmark/utils.py @@ -1,6 +1,7 @@ # radio charts, logs, helper functions for tests, anything else relevant. import glob import re +from datetime import datetime from pathlib import Path from typing import Any @@ -12,11 +13,13 @@ def calculate_info_test_path(benchmarks_folder_path: Path) -> str: if not INFO_TESTS_PATH.exists(): INFO_TESTS_PATH.mkdir(parents=True, exist_ok=True) - return str(INFO_TESTS_PATH / "1.json") + return str( + INFO_TESTS_PATH / f"file1_{datetime.now().strftime('%m-%d-%H-%M')}.json" + ) else: json_files = glob.glob(str(INFO_TESTS_PATH / "*.json")) file_count = len(json_files) - run_name = f"{file_count + 1}.json" + run_name = f"file{file_count + 1}_{datetime.now().strftime('%m-%d-%H-%M')}.json" new_file_path = INFO_TESTS_PATH / run_name return str(new_file_path) @@ -35,8 +38,10 @@ def replace_backslash(value: Any) -> Any: def calculate_success_percentage(results: list[bool]) -> float: - success_count = results.count(True) - total_count = len(results) + # Take the last 10 results or all if less than 10 + last_results = results[-10:] if len(results) > 10 else results + success_count = last_results.count(True) + total_count = len(last_results) if total_count == 0: return 0 success_percentage = (success_count / total_count) * 100 # as a percentage @@ -45,7 +50,7 @@ def calculate_success_percentage(results: list[bool]) -> float: def get_highest_success_difficulty(data: dict) -> str: highest_difficulty = None - highest_difficulty_level = -1 + highest_difficulty_level = 0 for test_name, test_data in data.items(): if test_data["metrics"]["success"]: diff --git a/agent/Auto-GPT b/agent/Auto-GPT index 357a918ec..62ad7aa8c 160000 --- a/agent/Auto-GPT +++ b/agent/Auto-GPT @@ -1 +1 @@ -Subproject commit 357a918ecc9936207c70cf363bb95d74ec510e84 +Subproject commit 62ad7aa8c9172f8b07cad939e215912088d6dc16 diff --git a/agent/SuperAGI b/agent/SuperAGI index bd4b3def6..f880b2464 160000 --- a/agent/SuperAGI +++ b/agent/SuperAGI @@ -1 +1 @@ -Subproject commit bd4b3def65e964182b05bb9f7a350b00f55a6007 +Subproject commit f880b24644fbd057d44e8b4390f3ac165c90249b diff --git a/agent/config_example.json b/agent/config_example.json index 7ab65bc20..9e8bd3f08 100644 --- a/agent/config_example.json +++ b/agent/config_example.json @@ -1,5 +1,4 @@ { "workspace": "projects/my-new-project/workspace", - "entry_path": "agbenchmark/benchmarks.py", - "cutoff": 60 + "entry_path": "agbenchmark/benchmarks.py" } diff --git a/agent/gpt-engineer b/agent/gpt-engineer index f0c76918d..a0162df0d 160000 --- a/agent/gpt-engineer +++ b/agent/gpt-engineer @@ -1 +1 @@ -Subproject commit f0c76918dff7a6cf5e0611a09b060fc5d4913b82 +Subproject commit a0162df0db24be0c888ad56d12bd59d6130d32f0 diff --git a/agent/mini-agi b/agent/mini-agi index 08764876d..0f8eba95d 160000 --- a/agent/mini-agi +++ b/agent/mini-agi @@ -1 +1 @@ -Subproject commit 08764876d9a5c84c9f9e879088854d2b9349d7a0 +Subproject commit 
0f8eba95d284a9a06801b40ae02c55f65f1a0ce9 diff --git a/agent/smol-developer b/agent/smol-developer index f4f439551..70b57dd04 160000 --- a/agent/smol-developer +++ b/agent/smol-developer @@ -1 +1 @@ -Subproject commit f4f4395511ed6ba59ec09100d6596bf81d68a898 +Subproject commit 70b57dd042bea14d6e21d56e9e115ee0fc9676f7 -- cgit v1.2.3 From 2704bcee5ef86eb3da75139a08f618135f66d754 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Sun, 16 Jul 2023 07:26:36 -0700 Subject: Allow change location of reports (#115) Signed-off-by: Merwane Hamadi --- .github/workflows/ci.yml | 7 +++---- agbenchmark/utils.py | 6 +++++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 907c21267..9d4769e76 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -64,7 +64,6 @@ jobs: if: success() || failure() tests: - if: github.event_name != 'pull_request' || matrix.cache-enabled == false name: "${{ matrix.agent-name }} (Cache: ${{ matrix.cache-enabled }})" runs-on: ubuntu-latest timeout-minutes: 10 @@ -161,12 +160,12 @@ jobs: REQUESTS_CA_BUNDLE: /etc/ssl/certs/ca-certificates.crt HELICONE_CACHE_ENABLED: ${{ matrix.cache-enabled }} HELICONE_PROPERTY_AGENT: ${{ matrix.agent-name }} + REPORT_LOCATION: ${{ matrix.cache-enabled == true && format('../../../benchmark_runs/{0}', matrix.agent-name) || '.' }} - name: Upload reports if: always() uses: actions/upload-artifact@v3 with: - name: - "${{ matrix.agent-name }} (Cache ${{ matrix.cache-enabled }})" - path: agent/${{ matrix.agent-name }}/agbenchmark + name: ${{ matrix.agent-name }} + path: benchmark_runs/${{ matrix.agent-name }} diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py index 1174e89bb..506c48847 100644 --- a/agbenchmark/utils.py +++ b/agbenchmark/utils.py @@ -1,5 +1,6 @@ # radio charts, logs, helper functions for tests, anything else relevant. 
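The `REPORT_LOCATION` variable introduced in the workflow above is consumed by the `calculate_info_test_path` change in this utils.py hunk. A minimal standalone sketch of the pattern, with an illustrative helper name that is not part of the project's API:

```python
import os
from pathlib import Path


def resolve_reports_dir(benchmarks_folder: Path) -> Path:
    # REPORT_LOCATION defaults to "." so a plain local run keeps writing to
    # <benchmarks_folder>/reports; CI points it at a per-agent directory instead.
    reports_dir = benchmarks_folder / os.getenv("REPORT_LOCATION", ".") / "reports"
    reports_dir.mkdir(parents=True, exist_ok=True)
    return reports_dir
```

Defaulting to `"."` leaves local behaviour untouched while the workflow redirects reports to `benchmark_runs/<agent-name>`, the same directory the upload step collects.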
import glob +import os import re from datetime import datetime from pathlib import Path @@ -9,7 +10,10 @@ from agbenchmark.challenges.define_task_types import DIFFICULTY_MAP, DifficultyL def calculate_info_test_path(benchmarks_folder_path: Path) -> str: - INFO_TESTS_PATH = benchmarks_folder_path / "reports" + + INFO_TESTS_PATH = ( + benchmarks_folder_path / os.getenv("REPORT_LOCATION", ".") / "reports" + ) if not INFO_TESTS_PATH.exists(): INFO_TESTS_PATH.mkdir(parents=True, exist_ok=True) -- cgit v1.2.3 From 2cfafcfbf02c85182f68e6faa5d50c342f340faa Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Sun, 16 Jul 2023 07:54:49 -0700 Subject: Fix cutoff errors (#116) Signed-off-by: Merwane Hamadi --- agent/gpt-engineer | 2 +- agent/smol-developer | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/agent/gpt-engineer b/agent/gpt-engineer index a0162df0d..9bb81041a 160000 --- a/agent/gpt-engineer +++ b/agent/gpt-engineer @@ -1 +1 @@ -Subproject commit a0162df0db24be0c888ad56d12bd59d6130d32f0 +Subproject commit 9bb81041ace9f09e8ea0e34e29f2e46bb9d46a36 diff --git a/agent/smol-developer b/agent/smol-developer index 70b57dd04..a23d01369 160000 --- a/agent/smol-developer +++ b/agent/smol-developer @@ -1 +1 @@ -Subproject commit 70b57dd042bea14d6e21d56e9e115ee0fc9676f7 +Subproject commit a23d01369cea976e80b7889fdbf1096619471301 -- cgit v1.2.3 From 117e8c8dd1879dd97fe9933fc2bf9a6b2cd65a92 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Sun, 16 Jul 2023 08:10:53 -0700 Subject: Fix pipes issue (#117) --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9d4769e76..aca2e3f5a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -149,7 +149,7 @@ jobs: agbenchmark start --mock --category=iterate else curl -s https://raw.githubusercontent.com/Helicone/helicone/main/mitmproxy.sh | bash -s start - agbenchmark start | echo "This command will always return a non zero exit code unless all the challenges are solved." + agbenchmark start || echo "This command will always return a non zero exit code unless all the challenges are solved." 
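The one-character change above swaps a pipe for a logical OR. With `agbenchmark start | echo …`, `echo` ignores its stdin, the notice prints unconditionally, and (absent `pipefail`) the step's exit status comes from `echo` alone; with `||`, the benchmark's own output reaches the log and the notice appears only when `agbenchmark start` actually fails, while the step still exits cleanly. A rough Python equivalent of the intended behaviour, assuming an `agbenchmark` executable on the PATH:

```python
import subprocess

# Run the benchmark, let its output stream to the log, and tolerate a
# non-zero exit code: failing challenges should be reported, not fail CI.
result = subprocess.run(["agbenchmark", "start"])
if result.returncode != 0:
    print(
        "This command will always return a non zero exit code "
        "unless all the challenges are solved."
    )
```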
fi env: GITHUB_EVENT_NAME: ${{ github.event_name }} -- cgit v1.2.3 From b904041ea17829f4fd522a794d5a7b06b95c923b Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Sun, 16 Jul 2023 15:49:36 -0700 Subject: Update reports when pushing to master (#162) Signed-off-by: Merwane Hamadi --- .github/workflows/ci.yml | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index aca2e3f5a..a2224ea78 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,6 +7,8 @@ on: - cron: "0 8 * * *" push: branches: [master, ci-test*] + paths-ignore: + - 'benchmark_runs/**' pull_request: branches: [stable, master, release-*] @@ -64,11 +66,12 @@ jobs: if: success() || failure() tests: + env: + GH_TOKEN: ${{ github.event_name == 'pull_request' && github.token || secrets.PAT }} + min-python-version: "3.10" name: "${{ matrix.agent-name }} (Cache: ${{ matrix.cache-enabled }})" runs-on: ubuntu-latest timeout-minutes: 10 - env: - min-python-version: "3.10" strategy: fail-fast: false matrix: @@ -87,6 +90,7 @@ jobs: ref: ${{ github.event.pull_request.head.ref }} repository: ${{ github.event.pull_request.head.repo.full_name }} submodules: true + token: ${{ env.GH_TOKEN }} - name: Set up Python ${{ env.min-python-version }} uses: actions/setup-python@v2 @@ -151,6 +155,9 @@ jobs: curl -s https://raw.githubusercontent.com/Helicone/helicone/main/mitmproxy.sh | bash -s start agbenchmark start || echo "This command will always return a non zero exit code unless all the challenges are solved." fi + + cd ../.. + env: GITHUB_EVENT_NAME: ${{ github.event_name }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} @@ -160,7 +167,7 @@ jobs: REQUESTS_CA_BUNDLE: /etc/ssl/certs/ca-certificates.crt HELICONE_CACHE_ENABLED: ${{ matrix.cache-enabled }} HELICONE_PROPERTY_AGENT: ${{ matrix.agent-name }} - REPORT_LOCATION: ${{ matrix.cache-enabled == true && format('../../../benchmark_runs/{0}', matrix.agent-name) || '.' }} + REPORT_LOCATION: ${{ matrix.cache-enabled == false && format('../../../benchmark_runs/{0}', matrix.agent-name) || '.' 
}} - name: Upload reports @@ -169,3 +176,18 @@ jobs: with: name: ${{ matrix.agent-name }} path: benchmark_runs/${{ matrix.agent-name }} + + - name: Authenticate and Push to Branch + if: (success() || failure()) && (github.event_name != 'pull_request' && matrix.cache-enabled == false) + run: | + git config --global user.email "github-bot@agpt.co" + git config --global user.name "Auto-GPT-Bot" + + git add benchmark_runs/* || echo "nothing to commit" + commit_message="${{ matrix.agent-name }}-$(date +'%Y%m%d%H%M%S')" + git commit -m "${commit_message}" + + current_branch=${{ github.ref_name }} + git fetch origin $current_branch + git rebase origin/$current_branch + git push origin HEAD -- cgit v1.2.3 From a36eadf554df292ceaecf549fc02f2e949521c66 Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Sun, 16 Jul 2023 22:52:31 +0000 Subject: Auto-GPT-20230716225231 --- .../Auto-GPT/reports/file1_07-16-22-52.json | 179 +++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 benchmark_runs/Auto-GPT/reports/file1_07-16-22-52.json diff --git a/benchmark_runs/Auto-GPT/reports/file1_07-16-22-52.json b/benchmark_runs/Auto-GPT/reports/file1_07-16-22-52.json new file mode 100644 index 000000000..c13e4f59a --- /dev/null +++ b/benchmark_runs/Auto-GPT/reports/file1_07-16-22-52.json @@ -0,0 +1,179 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-16-22:52", + "metrics": { + "run_time": "14.51 seconds", + "highest_difficulty": ": 0" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 40.0, + "run_time": "14.286 seconds" + } + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d4", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 75.0, + "run_time": "0.001 seconds" + } + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 75.0, + "run_time": "0.002 seconds" + } + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5", + "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestThreeSum::test_method[challenge_data0] depends on 
venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0], venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0], venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 75.0, + "run_time": "0.001 seconds" + } + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0], venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0], venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 75.0, + "run_time": "0.001 seconds" + } + }, + "TestRetrieval2": { + "data_path": "agbenchmark/challenges/retrieval/r2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 75.0, + "run_time": "0.001 seconds" + } + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d3", + "is_regression": false, + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on 
venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 75.0, + "run_time": "0.001 seconds" + } + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2::test_method[challenge_data0]", + "success_%": 75.0, + "run_time": "0.001 seconds" + } + }, + "TestRememberMultipleIdsWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3", + "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIdsWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 75.0, + "run_time": "0.001 seconds" + } + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4", + "is_regression": false, + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIdsWithNoise::test_method[challenge_data0]", + "success_%": 75.0, + "run_time": "0.001 seconds" + } + } + }, + "config": { + "workspace": "autogpt/workspace/auto_gpt_workspace", + "entry_path": "agbenchmark.benchmarks" + } +} \ No newline at end of file -- cgit v1.2.3 From 5c7acbc71986d164e377740cbef7f8bf26e160e3 Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Sun, 16 Jul 2023 22:59:08 +0000 Subject: gpt-engineer-20230716225908 --- .../gpt-engineer/reports/file1_07-16-22-51.json | 175 +++++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100644 benchmark_runs/gpt-engineer/reports/file1_07-16-22-51.json diff --git a/benchmark_runs/gpt-engineer/reports/file1_07-16-22-51.json b/benchmark_runs/gpt-engineer/reports/file1_07-16-22-51.json new file mode 100644 index 000000000..acfd01457 --- /dev/null +++ b/benchmark_runs/gpt-engineer/reports/file1_07-16-22-51.json @@ -0,0 +1,175 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-16-22:59", + "metrics": { + "run_time": "449.82 seconds", + "highest_difficulty": "novice: 3" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 50.0, + "run_time": "62.5 seconds" + } + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d4", + 
"is_regression": false, + "metrics": { + "difficulty": "novice", + "success": true, + "success_%": 100.0, + "run_time": "70.822 seconds" + } + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": true, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "68.908 seconds" + } + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: '/home/runner/work/Auto-GPT-Benchmarks/Auto-GPT-Benchmarks/agent/gpt-engineer/projects/my-new-project/workspace/file_to_check.txt'", + "success_%": 75.0, + "run_time": "60.495 seconds" + } + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5", + "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "69.361 seconds" + } + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1", + "is_regression": true, + "metrics": { + "difficulty": "basic", + "success": true, + "success_%": 100.0, + "run_time": "67.503 seconds" + } + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 75.0, + "run_time": "0.001 seconds" + } + }, + "TestRetrieval2": { + "data_path": "agbenchmark/challenges/retrieval/r2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "assert 1 in [0.0, 0.0]", + "success_%": 75.0, + "run_time": "50.064 seconds" + } + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d3", + "is_regression": false, + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on 
agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 75.0, + "run_time": "0.001 seconds" + } + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2::test_method[challenge_data0]", + "success_%": 75.0, + "run_time": "0.001 seconds" + } + }, + "TestRememberMultipleIdsWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3", + "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIdsWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 75.0, + "run_time": "0.001 seconds" + } + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4", + "is_regression": false, + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIdsWithNoise::test_method[challenge_data0]", + "success_%": 75.0, + "run_time": "0.001 seconds" + } + } + }, + "config": { + "workspace": "projects/my-new-project/workspace", + "entry_path": "agbenchmark.benchmarks" + } +} \ No newline at end of file -- cgit v1.2.3 From ce4cefe7e7211025994a4eab84c3a96209e705cb Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Sun, 16 Jul 2023 21:24:06 -0400 Subject: Dynamic home path for runs (#119) --- .env.example | 2 +- agbenchmark/README.md | 72 ++-------- agbenchmark/agent_interface.py | 9 +- agbenchmark/config.json | 3 +- agbenchmark/internal_info.json | 12 +- agbenchmark/reports/file1_07-14-18-54.json | 147 --------------------- .../reports/mini-agi/file1_07-16-13-07.json | 23 ++++ agbenchmark/start_benchmark.py | 20 +-- agbenchmark/utils.py | 81 ++++++++++-- agent/Auto-GPT | 2 +- agent/SuperAGI | 2 +- agent/gpt-engineer | 2 +- agent/mini-agi | 2 +- agent/smol-developer | 2 +- 14 files changed, 135 insertions(+), 244 deletions(-) delete mode 100644 agbenchmark/reports/file1_07-14-18-54.json create mode 100644 agbenchmark/reports/mini-agi/file1_07-16-13-07.json diff --git a/.env.example b/.env.example index e50ed58a5..197810bbb 100644 --- a/.env.example +++ 
b/.env.example @@ -1,3 +1,3 @@ AGENT_NAME=mini-agi -ENVIRONMENT=local +HOME_ENV= MOCK_TEST=False \ No newline at end of file diff --git a/agbenchmark/README.md b/agbenchmark/README.md index 42e2bd4dd..c814e6cff 100644 --- a/agbenchmark/README.md +++ b/agbenchmark/README.md @@ -40,45 +40,6 @@ Let people know what beautiful code you write does, document everything well Share your progress :) -### Pytest - -an example of a test is below, use it as a template and change the class name, the .json name, what the test depends on and it's name, and the scoring logic - -```python -import pytest -from agbenchmark.tests.basic_abilities.BasicChallenge import BasicChallenge -import os - - -class TestWriteFile(BasicChallenge): - """Testing if LLM can write to a file""" - - def test_method(self, config): - # implement scoring logic by looking at workspace -``` - -All challenges will inherit from parent class which has the mark and any specific methods for their category - -```python -@pytest.mark.basic -class BasicChallenge(Challenge): - pass -``` - -Add the below to create a file in the workspace prior to running a challenge. Only use when a file is needed to be created in the workspace prior to a test, such as with the read_file_test. - -```python -@pytest.fixture( - scope="module", autouse=True - ) # this is specific to setting up a file for the test, not all tests have this - def setup_module(self, workspace): - Challenge.write_to_file( - workspace, self.data.ground.files[0], "this is how we're doing" - ) -``` - -#### The main Challenge class has all the parametrization and loading logic so that all tests can inherit from it. It lives within [this file](https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/blob/master/agbenchmark/Challenge.py) - ## Workspace If `--mock` flag is used it is at `agbenchmark/workspace`. Otherwise for mini-agi it is at `C:/Users//miniagi` - it will be automatically set on config @@ -87,29 +48,7 @@ If `--mock` flag is used it is at `agbenchmark/workspace`. Otherwise for mini-ag Manually created, existing challenges within Auto-Gpt, https://osu-nlp-group.github.io/Mind2Web/ -## Repo - -``` -|-- auto-gpt-benchmarks/ **main project directory** -| |-- metrics.py **combining scores, metrics, final evaluation** -| |-- start_benchmark.py **entry point from cli** -| |-- conftest.py **config, workspace creation + teardown, regression tesst markers, parameterization** -| |-- Challenge.py **easy challenge creation class** -| |-- config.json **workspace folder** -| |-- challenges/ **challenges across different domains** -| | |-- adaptability/ -| | |-- basic_abilities/ -| | |-- code/ -| | |-- memory/ -| | |-- retrieval/ -| | |-- web_navigation/ -| | |-- writing/ -| |-- tests/ -| | |-- basic_abilities/ **every llm should pass these challenges** -| | |-- regression/ **challenges that already passed** -``` - -## How to add new agents to agbenchmark ? +## How do I add new agents to agbenchmark? Example with smol developer. @@ -120,3 +59,12 @@ https://github.com/smol-ai/developer/pull/114/files 2- Create the submodule and the GitHub workflow by following the same pattern as this example: https://github.com/Significant-Gravitas/Auto-GPT-Benchmarks/pull/48/files + +## How do I run an agent in different environments? + +**To just use agbenchmark as the benchmark for your agent**, `pip install` the package and run `agbenchmark start` + +**For internal Auto-GPT CI runs**, specify the `AGENT_NAME` you want to use and set the `HOME_ENV`. +Ex. 
`HOME_ENV=ci AGENT_NAME=mini-agi` + +**To develop an agent alongside the benchmark**, you can specify the `AGENT_NAME` you want to use and add it as a submodule to the repo diff --git a/agbenchmark/agent_interface.py b/agbenchmark/agent_interface.py index 897f4f8cf..ff5bc8909 100644 --- a/agbenchmark/agent_interface.py +++ b/agbenchmark/agent_interface.py @@ -7,7 +7,7 @@ from typing import Any, Dict from dotenv import load_dotenv -from agbenchmark.start_benchmark import CURRENT_DIRECTORY +from agbenchmark.start_benchmark import CURRENT_DIRECTORY, HOME_DIRECTORY load_dotenv() @@ -25,13 +25,16 @@ def run_agent( config["workspace"], "artifacts_out", challenge_location ) else: - print(f"Running Python function '{config['entry_path']}' with timeout {cutoff}") - command = [sys.executable, "-m", config["entry_path"], str(task)] + entry_path = "agbenchmark.benchmarks" + + print(f"Running Python function '{entry_path}' with timeout {cutoff}") + command = [sys.executable, "-m", entry_path, str(task)] process = subprocess.Popen( command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True, + cwd=HOME_DIRECTORY, ) start_time = time.time() diff --git a/agbenchmark/config.json b/agbenchmark/config.json index 820f133b1..3a03b7412 100644 --- a/agbenchmark/config.json +++ b/agbenchmark/config.json @@ -1,4 +1,3 @@ { - "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "agbenchmark.benchmarks" + "workspace": "${os.path.join(Path.home(), 'miniagi')}" } diff --git a/agbenchmark/internal_info.json b/agbenchmark/internal_info.json index 95a051d54..0e34ad7a3 100644 --- a/agbenchmark/internal_info.json +++ b/agbenchmark/internal_info.json @@ -15,6 +15,8 @@ false ], "TestDebugSimpleTypoWithGuidance": [ + false, + false, false, false, false @@ -25,6 +27,7 @@ false ], "TestReadFile": [ + true, true, true, true @@ -55,6 +58,7 @@ true ], "TestSearch": [ + true, true, true, true @@ -68,6 +72,12 @@ false, false, true, - false + false, + true, + false, + false, + false, + false, + true ] } \ No newline at end of file diff --git a/agbenchmark/reports/file1_07-14-18-54.json b/agbenchmark/reports/file1_07-14-18-54.json deleted file mode 100644 index f81d19d3d..000000000 --- a/agbenchmark/reports/file1_07-14-18-54.json +++ /dev/null @@ -1,147 +0,0 @@ -{ - "command": "agbenchmark start --mock", - "completion_time": "2023-07-14-18:54", - "metrics": { - "run_time": "0.97 seconds", - "highest_difficulty": "advanced: 5" - }, - "tests": { - "TestWriteFile": { - "data_path": "agbenchmark/challenges/interface/write_file", - "is_regression": false, - "metrics": { - "difficulty": "interface", - "success": true, - "non_mock_success_%": 75.0, - "run_time": "0.007 seconds" - } - }, - "TestReadFile": { - "data_path": "agbenchmark/challenges/interface/read_file", - "is_regression": true, - "metrics": { - "difficulty": "interface", - "success": true, - "non_mock_success_%": 100.0, - "run_time": "0.008 seconds" - } - }, - "TestSearch": { - "data_path": "agbenchmark/challenges/interface/search", - "is_regression": true, - "metrics": { - "difficulty": "interface", - "success": true, - "non_mock_success_%": 100.0, - "run_time": "0.007 seconds" - } - }, - "TestDebugSimpleTypoWithGuidance": { - "data_path": "agbenchmark/challenges/code/d1", - "is_regression": false, - "metrics": { - "difficulty": "basic", - "success": false, - "fail_reason": "assert 1 in [0.0]", - "non_mock_success_%": 0.0, - "run_time": "0.448 seconds" - } - }, - "TestBasicMemory": { - "data_path": "agbenchmark/challenges/memory/m1", - 
"is_regression": true, - "metrics": { - "difficulty": "basic", - "success": true, - "non_mock_success_%": 100.0, - "run_time": "0.028 seconds" - } - }, - "TestBasicRetrieval": { - "data_path": "agbenchmark/challenges/retrieval/r1", - "is_regression": true, - "metrics": { - "difficulty": "basic", - "success": true, - "non_mock_success_%": 100.0, - "run_time": "0.014 seconds" - } - }, - "TestDebugSimpleTypoWithoutGuidance": { - "data_path": "agbenchmark/challenges/code/d2", - "is_regression": false, - "metrics": { - "difficulty": "novice", - "success": false, - "fail_reason": "agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", - "non_mock_success_%": 0.0, - "run_time": "0.001 seconds" - } - }, - "TestCreateSimpleWebServer": { - "data_path": "agbenchmark/challenges/code/d3", - "is_regression": false, - "metrics": { - "difficulty": "advanced", - "success": false, - "fail_reason": "agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", - "non_mock_success_%": 0.0, - "run_time": "0.002 seconds" - } - }, - "TestRememberMultipleIds": { - "data_path": "agbenchmark/challenges/memory/m2", - "is_regression": true, - "metrics": { - "difficulty": "novice", - "success": true, - "non_mock_success_%": 100.0, - "run_time": "0.023 seconds" - } - }, - "TestRetrieval2": { - "data_path": "agbenchmark/challenges/retrieval/r2", - "is_regression": true, - "metrics": { - "difficulty": "novice", - "success": true, - "non_mock_success_%": 100.0, - "run_time": "0.013 seconds" - } - }, - "TestRememberMultipleIdsWithNoise": { - "data_path": "agbenchmark/challenges/memory/m3", - "is_regression": true, - "metrics": { - "difficulty": "intermediate", - "success": true, - "non_mock_success_%": 100.0, - "run_time": "0.03 seconds" - } - }, - "TestRetrieval3": { - "data_path": "agbenchmark/challenges/retrieval/r3", - "is_regression": true, - "metrics": { - "difficulty": "intermediate", - "success": true, - "non_mock_success_%": 100.0, - "run_time": "0.016 seconds" - } - }, - "TestRememberMultiplePhrasesWithNoise": { - "data_path": "agbenchmark/challenges/memory/m4", - "is_regression": true, - "metrics": { - "difficulty": "advanced", - "success": true, - "non_mock_success_%": 100.0, - "run_time": "0.034 seconds" - } - } - }, - "config": { - "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "agbenchmark.benchmarks" - } -} \ No newline at end of file diff --git a/agbenchmark/reports/mini-agi/file1_07-16-13-07.json b/agbenchmark/reports/mini-agi/file1_07-16-13-07.json new file mode 100644 index 000000000..78bafc5f1 --- /dev/null +++ b/agbenchmark/reports/mini-agi/file1_07-16-13-07.json @@ -0,0 +1,23 @@ +{ + "command": "agbenchmark start --test TestWriteFile", + "completion_time": "2023-07-16-13:07", + "metrics": { + "run_time": "13.91 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 30.0, + "run_time": "13.684 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + } +} \ No newline at end of file diff --git a/agbenchmark/start_benchmark.py 
b/agbenchmark/start_benchmark.py index b31c9f5f9..ea17d1523 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -6,20 +6,17 @@ from typing import Any import click import pytest -from dotenv import load_dotenv -load_dotenv() - -from agbenchmark.utils import calculate_info_test_path +from agbenchmark.utils import calculate_dynamic_paths CURRENT_DIRECTORY = Path(__file__).resolve().parent -benchmarks_folder_path = Path(os.getcwd()) / "agbenchmark" - -CONFIG_PATH = str(benchmarks_folder_path / "config.json") -REGRESSION_TESTS_PATH = str(benchmarks_folder_path / "regression_tests.json") - -INFO_TESTS_PATH = calculate_info_test_path(benchmarks_folder_path) +( + HOME_DIRECTORY, + CONFIG_PATH, + REGRESSION_TESTS_PATH, + INFO_TESTS_PATH, +) = calculate_dynamic_paths() @click.group() @@ -48,9 +45,6 @@ def start(category: str, test: str, maintain: bool, improve: bool, mock: bool) - ) return 1 - if not benchmarks_folder_path.exists(): - benchmarks_folder_path.mkdir(exist_ok=True) - print(CONFIG_PATH, os.path.exists(CONFIG_PATH), os.stat(CONFIG_PATH).st_size) if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0: config = {} diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py index 506c48847..c69509c70 100644 --- a/agbenchmark/utils.py +++ b/agbenchmark/utils.py @@ -6,25 +6,28 @@ from datetime import datetime from pathlib import Path from typing import Any -from agbenchmark.challenges.define_task_types import DIFFICULTY_MAP, DifficultyLevel +from dotenv import load_dotenv + +load_dotenv() +from agbenchmark.challenges.define_task_types import DIFFICULTY_MAP, DifficultyLevel -def calculate_info_test_path(benchmarks_folder_path: Path) -> str: +AGENT_NAME = os.getenv("AGENT_NAME") +HOME_ENV = os.getenv("HOME_ENV") - INFO_TESTS_PATH = ( - benchmarks_folder_path / os.getenv("REPORT_LOCATION", ".") / "reports" - ) - if not INFO_TESTS_PATH.exists(): - INFO_TESTS_PATH.mkdir(parents=True, exist_ok=True) +def calculate_info_test_path(reports_path: Path) -> str: + print("reports_pathreports_pathreports_pathreports_path", reports_path) + if not reports_path.exists(): + reports_path.mkdir(parents=True, exist_ok=True) return str( - INFO_TESTS_PATH / f"file1_{datetime.now().strftime('%m-%d-%H-%M')}.json" + reports_path / f"file1_{datetime.now().strftime('%m-%d-%H-%M')}.json" ) else: - json_files = glob.glob(str(INFO_TESTS_PATH / "*.json")) + json_files = glob.glob(str(reports_path / "*.json")) file_count = len(json_files) run_name = f"file{file_count + 1}_{datetime.now().strftime('%m-%d-%H-%M')}.json" - new_file_path = INFO_TESTS_PATH / run_name + new_file_path = reports_path / run_name return str(new_file_path) @@ -79,3 +82,61 @@ def get_highest_success_difficulty(data: dict) -> str: highest_difficulty_str = "" return f"{highest_difficulty_str}: {highest_difficulty_level}" + + +def assign_paths(folder_path: Path) -> tuple[str, str, str]: + CONFIG_PATH = str(folder_path / "config.json") + REGRESSION_TESTS_PATH = str(folder_path / "regression_tests.json") + + if HOME_ENV == "ci" and AGENT_NAME: + INFO_TESTS_PATH = calculate_info_test_path( + Path(os.getcwd()) / "agbenchmark" / "reports" / AGENT_NAME + ) + else: + INFO_TESTS_PATH = calculate_info_test_path(folder_path / "reports") + + return CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH + + +def calculate_dynamic_paths() -> tuple[Path, str, str, str]: + # the default home is where you're running from + HOME_DIRECTORY = Path(os.getcwd()) + benchmarks_folder_path = HOME_DIRECTORY / "agbenchmark" + + if 
AGENT_NAME and HOME_ENV == "ci": + if "/Auto-GPT-Benchmarks/agent" in str(HOME_DIRECTORY): + raise Exception("Must run from root of benchmark repo if HOME_ENV is ci") + + # however if the env is local and the agent name is defined, we want to run that agent from the repo and then get the data in the internal agbenchmark directory + # this is for the ci/cd pipeline + benchmarks_folder_path = HOME_DIRECTORY / "agent" / AGENT_NAME / "agbenchmark" + + CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH = assign_paths( + benchmarks_folder_path + ) + + # we want to run the agent from the submodule + HOME_DIRECTORY = Path(os.getcwd()) / "agent" / AGENT_NAME + + elif AGENT_NAME and not os.path.join("Auto-GPT-Benchmarks", "agent") in str( + HOME_DIRECTORY + ): + # if the agent name is defined but the run is not from the agent repo, then home is the agent repo + # used for development of both a benchmark and an agent + HOME_DIRECTORY = Path(os.getcwd()) / "agent" / AGENT_NAME + benchmarks_folder_path = HOME_DIRECTORY / "agbenchmark" + + CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH = assign_paths( + benchmarks_folder_path + ) + else: + # otherwise the default is when home is an agent (running agbenchmark from agent/agent_repo) + # used when its just a pip install + CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH = assign_paths( + benchmarks_folder_path + ) + + if not benchmarks_folder_path.exists(): + benchmarks_folder_path.mkdir(exist_ok=True) + + return HOME_DIRECTORY, CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH diff --git a/agent/Auto-GPT b/agent/Auto-GPT index 62ad7aa8c..114c484b5 160000 --- a/agent/Auto-GPT +++ b/agent/Auto-GPT @@ -1 +1 @@ -Subproject commit 62ad7aa8c9172f8b07cad939e215912088d6dc16 +Subproject commit 114c484b5cfe9a69a74ddcc00025d4a126f54120 diff --git a/agent/SuperAGI b/agent/SuperAGI index f880b2464..ae3b89a32 160000 --- a/agent/SuperAGI +++ b/agent/SuperAGI @@ -1 +1 @@ -Subproject commit f880b24644fbd057d44e8b4390f3ac165c90249b +Subproject commit ae3b89a325994c9dda74b5de39d6f7c48010270f diff --git a/agent/gpt-engineer b/agent/gpt-engineer index 9bb81041a..a1d9673f8 160000 --- a/agent/gpt-engineer +++ b/agent/gpt-engineer @@ -1 +1 @@ -Subproject commit 9bb81041ace9f09e8ea0e34e29f2e46bb9d46a36 +Subproject commit a1d9673f82ffce89a9b437e1b54d2e068160819d diff --git a/agent/mini-agi b/agent/mini-agi index 0f8eba95d..bb02bf0d5 160000 --- a/agent/mini-agi +++ b/agent/mini-agi @@ -1 +1 @@ -Subproject commit 0f8eba95d284a9a06801b40ae02c55f65f1a0ce9 +Subproject commit bb02bf0d5cdbf045ff145271b78e4b4ee7225011 diff --git a/agent/smol-developer b/agent/smol-developer index a23d01369..bec01917a 160000 --- a/agent/smol-developer +++ b/agent/smol-developer @@ -1 +1 @@ -Subproject commit a23d01369cea976e80b7889fdbf1096619471301 +Subproject commit bec01917a9fa6e7bd73e4d14b328dba468cae495 -- cgit v1.2.3 From dffc1dfd51ebe313d6b20e90a765d538a04f8e4b Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Mon, 17 Jul 2023 09:39:24 -0400 Subject: internal_info.json dynamic changes (#163) --- agbenchmark/ReportManager.py | 16 +++++-- agbenchmark/conftest.py | 22 +++++++-- agbenchmark/internal_info.json | 83 ---------------------------------- agbenchmark/reports/internal_info.json | 40 ++++++++++++++++ agbenchmark/utils.py | 9 +++- 5 files changed, 77 insertions(+), 93 deletions(-) delete mode 100644 agbenchmark/internal_info.json create mode 100644 agbenchmark/reports/internal_info.json diff --git a/agbenchmark/ReportManager.py b/agbenchmark/ReportManager.py index cae13595a..202574f9f 
100644 --- a/agbenchmark/ReportManager.py +++ b/agbenchmark/ReportManager.py @@ -3,7 +3,7 @@ import os import sys import time from datetime import datetime -from typing import Any, Dict +from typing import Any, Dict, Optional from agbenchmark.utils import get_highest_success_difficulty @@ -37,8 +37,18 @@ class ReportManager: with open(self.filename, "w") as f: json.dump(self.tests, f, indent=4) - def add_test(self, test_name: str, test_details: dict | list) -> None: - self.tests[test_name] = test_details + def add_test( + self, + test_name: str, + test_details: dict | list, + agent_name: Optional[str] = None, + ) -> None: + if agent_name: + if agent_name not in self.tests: + self.tests[agent_name] = {} + self.tests[agent_name][test_name] = test_details + else: + self.tests[test_name] = test_details self.save() diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 245df485e..4a62af0b5 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -15,7 +15,7 @@ from agbenchmark.start_benchmark import ( REGRESSION_TESTS_PATH, get_regression_data, ) -from agbenchmark.utils import calculate_success_percentage +from agbenchmark.utils import AGENT_NAME, calculate_success_percentage def resolve_workspace(workspace: str) -> str: @@ -128,9 +128,10 @@ regression_manager = ReportManager(REGRESSION_TESTS_PATH) # user facing reporting information info_manager = ReportManager(INFO_TESTS_PATH) -INTERNAL_LOGS = Path(__file__).resolve().parent # agbenchmark/conftest.py +INTERNAL_LOGS_PATH = Path(__file__).resolve().parent / "reports" + # internal db step in replacement track pass/fail rate -internal_info = ReportManager(str(INTERNAL_LOGS / "internal_info.json")) +internal_info = ReportManager(str(INTERNAL_LOGS_PATH / "internal_info.json")) def pytest_runtest_makereport(item: Any, call: Any) -> None: @@ -171,11 +172,22 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None: regression_manager.remove_test(test_name) info_details["metrics"]["fail_reason"] = str(call.excinfo.value) - prev_test_results: list[bool] = internal_info.tests.get(test_name, []) + prev_test_results: list[bool] + agent_tests: dict[str, list[bool]] = {} + + # if the structure is nested inside of the agent name + if AGENT_NAME: + agent_tests = internal_info.tests.get(AGENT_NAME, {}) + + if agent_tests: + prev_test_results = agent_tests.get(test_name, []) + else: + prev_test_results = internal_info.tests.get(test_name, []) + if not mock: # only add if it's an actual test prev_test_results.append(info_details["metrics"]["success"]) - internal_info.add_test(test_name, prev_test_results) + internal_info.add_test(test_name, prev_test_results, AGENT_NAME) # can calculate success rate regardless of mock info_details["metrics"]["success_%"] = calculate_success_percentage( diff --git a/agbenchmark/internal_info.json b/agbenchmark/internal_info.json deleted file mode 100644 index 0e34ad7a3..000000000 --- a/agbenchmark/internal_info.json +++ /dev/null @@ -1,83 +0,0 @@ -{ - "TestBasicMemory": [ - true, - true, - true - ], - "TestBasicRetrieval": [ - true, - true, - true - ], - "TestCreateSimpleWebServer": [ - false, - false, - false - ], - "TestDebugSimpleTypoWithGuidance": [ - false, - false, - false, - false, - false - ], - "TestDebugSimpleTypoWithoutGuidance": [ - false, - false, - false - ], - "TestReadFile": [ - true, - true, - true, - true - ], - "TestRememberMultipleIds": [ - true, - true, - true - ], - "TestRememberMultipleIdsWithNoise": [ - true, - true, - true - ], - "TestRememberMultiplePhrasesWithNoise": [ - 
true, - true, - true - ], - "TestRetrieval2": [ - true, - true, - true - ], - "TestRetrieval3": [ - true, - true, - true - ], - "TestSearch": [ - true, - true, - true, - true - ], - "TestWriteFile": [ - true, - true, - true, - false, - false, - false, - false, - true, - false, - true, - false, - false, - false, - false, - true - ] -} \ No newline at end of file diff --git a/agbenchmark/reports/internal_info.json b/agbenchmark/reports/internal_info.json new file mode 100644 index 000000000..97b525c0f --- /dev/null +++ b/agbenchmark/reports/internal_info.json @@ -0,0 +1,40 @@ +{ + "mini-agi": { + "TestBasicMemory": [true, true, true], + "TestBasicRetrieval": [true, true, true], + "TestCreateSimpleWebServer": [false, false, false], + "TestDebugSimpleTypoWithGuidance": [ + false, + false, + false, + false, + false, + false + ], + "TestDebugSimpleTypoWithoutGuidance": [false, false, false], + "TestReadFile": [true, true, true, true], + "TestRememberMultipleIds": [true, true, true], + "TestRememberMultipleIdsWithNoise": [true, true, true], + "TestRememberMultiplePhrasesWithNoise": [true, true, true], + "TestRetrieval2": [true, true, true], + "TestRetrieval3": [true, true, true], + "TestSearch": [true, true, true, true], + "TestWriteFile": [ + true, + true, + true, + false, + false, + false, + false, + true, + false, + true, + false, + false, + false, + false, + true + ] + } +} diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py index c69509c70..e99a1fa05 100644 --- a/agbenchmark/utils.py +++ b/agbenchmark/utils.py @@ -17,7 +17,6 @@ HOME_ENV = os.getenv("HOME_ENV") def calculate_info_test_path(reports_path: Path) -> str: - print("reports_pathreports_pathreports_pathreports_path", reports_path) if not reports_path.exists(): reports_path.mkdir(parents=True, exist_ok=True) return str( @@ -129,6 +128,7 @@ def calculate_dynamic_paths() -> tuple[Path, str, str, str]: CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH = assign_paths( benchmarks_folder_path ) + else: # otherwise the default is when home is an agent (running agbenchmark from agent/agent_repo) # used when its just a pip install @@ -139,4 +139,9 @@ def calculate_dynamic_paths() -> tuple[Path, str, str, str]: if not benchmarks_folder_path.exists(): benchmarks_folder_path.mkdir(exist_ok=True) - return HOME_DIRECTORY, CONFIG_PATH, REGRESSION_TESTS_PATH, INFO_TESTS_PATH + return ( + HOME_DIRECTORY, + CONFIG_PATH, + REGRESSION_TESTS_PATH, + INFO_TESTS_PATH, + ) -- cgit v1.2.3 From 8aa6452cc4c76610597ae56f90d5af91170cd1eb Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Mon, 17 Jul 2023 11:24:16 -0400 Subject: file naming when --test (#164) --- agbenchmark/reports/internal_info.json | 110 +++++++++++++-------- .../reports/mini-agi/1.1_TestWriteFile.json | 36 +++++++ agbenchmark/reports/mini-agi/1_TestWriteFIle.json | 27 +++++ agbenchmark/reports/mini-agi/2.1_TestReadFile.json | 27 +++++ agbenchmark/reports/mini-agi/2_TestReadFile.json | 27 +++++ agbenchmark/reports/mini-agi/3_TestSearch.json | 27 +++++ .../4.1_TestDebugSimpleTypoWithGuidance.json | 28 ++++++ .../4_TestDebugSimpleTypoWithGuidance.json | 28 ++++++ .../reports/mini-agi/file1_07-16-13-07.json | 23 ----- agbenchmark/utils.py | 52 ++++++++-- agent/mini-agi | 2 +- 11 files changed, 315 insertions(+), 72 deletions(-) create mode 100644 agbenchmark/reports/mini-agi/1.1_TestWriteFile.json create mode 100644 agbenchmark/reports/mini-agi/1_TestWriteFIle.json create mode 100644 agbenchmark/reports/mini-agi/2.1_TestReadFile.json create mode 100644 
agbenchmark/reports/mini-agi/2_TestReadFile.json create mode 100644 agbenchmark/reports/mini-agi/3_TestSearch.json create mode 100644 agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json create mode 100644 agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json delete mode 100644 agbenchmark/reports/mini-agi/file1_07-16-13-07.json diff --git a/agbenchmark/reports/internal_info.json b/agbenchmark/reports/internal_info.json index 97b525c0f..0bfad744a 100644 --- a/agbenchmark/reports/internal_info.json +++ b/agbenchmark/reports/internal_info.json @@ -1,40 +1,72 @@ { - "mini-agi": { - "TestBasicMemory": [true, true, true], - "TestBasicRetrieval": [true, true, true], - "TestCreateSimpleWebServer": [false, false, false], - "TestDebugSimpleTypoWithGuidance": [ - false, - false, - false, - false, - false, - false - ], - "TestDebugSimpleTypoWithoutGuidance": [false, false, false], - "TestReadFile": [true, true, true, true], - "TestRememberMultipleIds": [true, true, true], - "TestRememberMultipleIdsWithNoise": [true, true, true], - "TestRememberMultiplePhrasesWithNoise": [true, true, true], - "TestRetrieval2": [true, true, true], - "TestRetrieval3": [true, true, true], - "TestSearch": [true, true, true, true], - "TestWriteFile": [ - true, - true, - true, - false, - false, - false, - false, - true, - false, - true, - false, - false, - false, - false, - true - ] - } -} + "mini-agi": { + "TestBasicMemory": [ + true, + true, + true + ], + "TestBasicRetrieval": [ + true, + true, + true + ], + "TestCreateSimpleWebServer": [ + false, + false, + false + ], + "TestDebugSimpleTypoWithGuidance": [ + false, + false, + false + ], + "TestDebugSimpleTypoWithoutGuidance": [ + false, + false, + false + ], + "TestReadFile": [ + true, + true, + true, + true, + true + ], + "TestRememberMultipleIds": [ + true, + true, + true + ], + "TestRememberMultipleIdsWithNoise": [ + true, + true, + true + ], + "TestRememberMultiplePhrasesWithNoise": [ + true, + true, + true + ], + "TestRetrieval2": [ + true, + true, + true + ], + "TestRetrieval3": [ + true, + true, + true + ], + "TestSearch": [ + true, + true, + true, + true + ], + "TestWriteFile": [ + true, + true, + true + ] + } +} \ No newline at end of file diff --git a/agbenchmark/reports/mini-agi/1.1_TestWriteFile.json b/agbenchmark/reports/mini-agi/1.1_TestWriteFile.json new file mode 100644 index 000000000..637c2d5c5 --- /dev/null +++ b/agbenchmark/reports/mini-agi/1.1_TestWriteFile.json @@ -0,0 +1,36 @@ +{ + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": true, + "metrics": { + "difficulty": "interface", + "success": true, + "non_mock_success_%": 100.0, + "run_time": "0.009 seconds" + } + }, + "additional": { + "model": "gpt-3.5-turbo" + }, + "command": "agbenchmark start --test TestWriteFile", + "completion_time": "2023-07-17-09:54", + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "metrics": { + "run_time": "22.36 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 40.0, + "run_time": "22.169 seconds" + } + } + } +} \ No newline at end of file diff --git a/agbenchmark/reports/mini-agi/1_TestWriteFIle.json b/agbenchmark/reports/mini-agi/1_TestWriteFIle.json new file mode 100644 index 000000000..e64783190 --- /dev/null +++ 
b/agbenchmark/reports/mini-agi/1_TestWriteFIle.json @@ -0,0 +1,27 @@ +{ + "command": "agbenchmark start --test TestWriteFile", + "completion_time": "2023-07-15-22:13", + "metrics": { + "run_time": "12.4 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 50.0, + "run_time": "12.127 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}", + "entry_path": "agbenchmark.benchmarks" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/2.1_TestReadFile.json b/agbenchmark/reports/mini-agi/2.1_TestReadFile.json new file mode 100644 index 000000000..b5d73af99 --- /dev/null +++ b/agbenchmark/reports/mini-agi/2.1_TestReadFile.json @@ -0,0 +1,27 @@ +{ + "command": "agbenchmark start --test TestReadFile", + "completion_time": "2023-07-17-10:12", + "metrics": { + "run_time": "65.27 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": true, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "65.074 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4", + "reached_termination_time": true + } +} diff --git a/agbenchmark/reports/mini-agi/2_TestReadFile.json b/agbenchmark/reports/mini-agi/2_TestReadFile.json new file mode 100644 index 000000000..869eaaac1 --- /dev/null +++ b/agbenchmark/reports/mini-agi/2_TestReadFile.json @@ -0,0 +1,27 @@ +{ + "command": "agbenchmark start --test TestReadFile", + "completion_time": "2023-07-15-22:13", + "metrics": { + "run_time": "31.2 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": true, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "30.903 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}", + "entry_path": "agbenchmark.benchmarks" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/3_TestSearch.json b/agbenchmark/reports/mini-agi/3_TestSearch.json new file mode 100644 index 000000000..d9d05db4a --- /dev/null +++ b/agbenchmark/reports/mini-agi/3_TestSearch.json @@ -0,0 +1,27 @@ +{ + "command": "agbenchmark start --test TestSearch", + "completion_time": "2023-07-15-22:14", + "metrics": { + "run_time": "16.88 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": true, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "16.572 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}", + "entry_path": "agbenchmark.benchmarks" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json b/agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json new file mode 100644 index 000000000..d72d599d8 --- /dev/null +++ b/agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json @@ -0,0 +1,28 @@ +{ + "command": "agbenchmark start --test 
TestDebugSimpleTypoWithGuidance", + "completion_time": "2023-07-15-22:16", + "metrics": { + "run_time": "45.92 seconds", + "highest_difficulty": ": 0" + }, + "tests": { + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "45.599 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}", + "entry_path": "agbenchmark.benchmarks" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json b/agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json new file mode 100644 index 000000000..7985a7843 --- /dev/null +++ b/agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json @@ -0,0 +1,28 @@ +{ + "command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance", + "completion_time": "2023-07-15-22:15", + "metrics": { + "run_time": "32.99 seconds", + "highest_difficulty": ": 0" + }, + "tests": { + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "32.582 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}", + "entry_path": "agbenchmark.benchmarks" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/file1_07-16-13-07.json b/agbenchmark/reports/mini-agi/file1_07-16-13-07.json deleted file mode 100644 index 78bafc5f1..000000000 --- a/agbenchmark/reports/mini-agi/file1_07-16-13-07.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "command": "agbenchmark start --test TestWriteFile", - "completion_time": "2023-07-16-13:07", - "metrics": { - "run_time": "13.91 seconds", - "highest_difficulty": "interface: 1" - }, - "tests": { - "TestWriteFile": { - "data_path": "agbenchmark/challenges/interface/write_file", - "is_regression": false, - "metrics": { - "difficulty": "interface", - "success": true, - "success_%": 30.0, - "run_time": "13.684 seconds" - } - } - }, - "config": { - "workspace": "${os.path.join(Path.home(), 'miniagi')}" - } -} \ No newline at end of file diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py index e99a1fa05..5f1bb30da 100644 --- a/agbenchmark/utils.py +++ b/agbenchmark/utils.py @@ -1,7 +1,9 @@ # radio charts, logs, helper functions for tests, anything else relevant. 
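The hunk below replaces the flat `fileN_<timestamp>.json` report naming with a per-test prefix scheme: the first run of a new test takes the next whole-number prefix (e.g. `5_TestSearch.json`), and each repeat run of the same test gets a dotted sub-index (`5.1_TestSearch.json`, `5.2_TestSearch.json`). A minimal sketch of that naming rule, assuming the same `--test <name>` CLI convention (`next_report_name` and its `existing` argument are illustrative names, not part of the codebase):

    import math
    from pathlib import Path

    def next_report_name(existing: list[str], test_arg: str) -> str:
        # Repeat runs of a test keep its whole-number prefix and append a .N sub-index;
        # brand-new tests increment the highest whole-number prefix seen so far.
        related = [f for f in existing if test_arg in f]
        if not related:
            prefixes = [math.floor(float(Path(f).stem.split("_")[0])) for f in existing]
            return f"{max(prefixes, default=0) + 1}_{test_arg}.json"
        prefix = math.floor(float(Path(related[0]).stem.rsplit("_", 1)[0].split(".")[0]))
        return f"{prefix}.{len(related)}_{test_arg}.json"

    # Hypothetical usage:
    #   next_report_name(["1_TestWriteFile.json"], "TestReadFile")  -> "2_TestReadFile.json"
    #   next_report_name(["1_TestWriteFile.json"], "TestWriteFile") -> "1.1_TestWriteFile.json"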
import glob +import math import os import re +import sys from datetime import datetime from pathlib import Path from typing import Any @@ -17,17 +19,49 @@ HOME_ENV = os.getenv("HOME_ENV") def calculate_info_test_path(reports_path: Path) -> str: + command = sys.argv + if not reports_path.exists(): reports_path.mkdir(parents=True, exist_ok=True) - return str( - reports_path / f"file1_{datetime.now().strftime('%m-%d-%H-%M')}.json" - ) - else: - json_files = glob.glob(str(reports_path / "*.json")) - file_count = len(json_files) - run_name = f"file{file_count + 1}_{datetime.now().strftime('%m-%d-%H-%M')}.json" - new_file_path = reports_path / run_name - return str(new_file_path) + + json_files = glob.glob(str(reports_path / "*.json")) + + # Default naming scheme + file_count = len(json_files) + run_name = f"file{file_count + 1}_{datetime.now().strftime('%m-%d-%H-%M')}.json" + + # If "--test" is in command + if "--test" in command: + test_index = command.index("--test") + try: + test_arg = command[test_index + 1] # Argument after --test + except IndexError: + raise ValueError("Expected an argument after --test") + + # Get all files that include the string that is the argument after --test + related_files = [f for f in json_files if test_arg in f] + related_file_count = len(related_files) + + # Determine the prefix based on the existing files + if related_file_count == 0: + # Try to find the highest prefix number among all files, then increment it + all_prefix_numbers = [] + for f in json_files: + number = float(Path(f).stem.split("_")[0]) + all_prefix_numbers.append(math.floor(number)) + + max_prefix = max(all_prefix_numbers, default=0) + print(f"Highest existing report prefix: {max_prefix}") + run_name = f"{max_prefix + 1}_{test_arg}.json" + else: + # Take the number from before the _ and add the .{number} + prefix_str = Path(related_files[0]).stem.rsplit("_", 1)[0].split(".")[0] + prefix = math.floor(float(prefix_str)) + run_name = f"{prefix}.{related_file_count}_{test_arg}.json" + + print(f"Generated report file name: {run_name}") + new_file_path = reports_path / run_name + return str(new_file_path) def replace_backslash(value: Any) -> Any: diff --git a/agent/mini-agi b/agent/mini-agi index bb02bf0d5..0a9fcd8c3 160000 --- a/agent/mini-agi +++ b/agent/mini-agi @@ -1 +1 @@ -Subproject commit bb02bf0d5cdbf045ff145271b78e4b4ee7225011 +Subproject commit 0a9fcd8c3d6352ef42d436cff7b64683a7a7ca2d -- cgit v1.2.3 From 515742ee61387593e0c6b21b15e92e35ead78a09 Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Mon, 17 Jul 2023 19:11:55 +0200 Subject: Fix loading the plugins config (#5000) --- autogpt/config/config.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/autogpt/config/config.py b/autogpt/config/config.py index cb3f26d3e..b41ff68a1 100644 --- a/autogpt/config/config.py +++ b/autogpt/config/config.py @@ -290,11 +290,6 @@ class ConfigBuilder(Configurable[Config]): config_dict["plugins_allowlist"] = _safe_split(os.getenv("ALLOWLISTED_PLUGINS")) config_dict["plugins_denylist"] = _safe_split(os.getenv("DENYLISTED_PLUGINS")) - config_dict["plugins_config"] = PluginsConfig.load_config( - config_dict["plugins_config_file"], - config_dict["plugins_denylist"], - config_dict["plugins_allowlist"], - ) with contextlib.suppress(TypeError): config_dict["image_size"] = int(os.getenv("IMAGE_SIZE")) @@ -318,7 +313,17 @@ class ConfigBuilder(Configurable[Config]): k: v for k, v in config_dict.items() if v is not None } - return
cls.build_agent_configuration(config_dict_without_none_values) + config = cls.build_agent_configuration(config_dict_without_none_values) + + # Set secondary config variables (that depend on other config variables) + + config.plugins_config = PluginsConfig.load_config( + config.plugins_config_file, + config.plugins_denylist, + config.plugins_allowlist, + ) + + return config @classmethod def load_azure_config(cls, config_file: str = AZURE_CONFIG_FILE) -> Dict[str, str]: -- cgit v1.2.3 From a758acef2cf12b206d7172b47880dd876f8ad4bc Mon Sep 17 00:00:00 2001 From: Sohrab Saran Date: Mon, 17 Jul 2023 23:54:47 +0530 Subject: Fix `execute_python_file` workspace mount & Windows path formatting (#4996) * fix for #4975 * Add TODO based on code comment. * Use builtin `Path.as_posix()` * Remove TODO --------- Co-authored-by: Reinier van der Leer --- autogpt/commands/execute_code.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/autogpt/commands/execute_code.py b/autogpt/commands/execute_code.py index 2403b2ba5..fb4cb70ea 100644 --- a/autogpt/commands/execute_code.py +++ b/autogpt/commands/execute_code.py @@ -145,11 +145,14 @@ def execute_python_file(filename: str, agent: Agent) -> str: logger.debug(f"Running {file_path} in a {image_name} container...") container: DockerContainer = client.containers.run( image_name, - ["python", str(file_path.relative_to(agent.workspace.root))], + [ + "python", + file_path.relative_to(agent.workspace.root).as_posix(), + ], volumes={ agent.config.workspace_path: { "bind": "/workspace", - "mode": "ro", + "mode": "rw", } }, working_dir="/workspace", -- cgit v1.2.3 From d76317fbf38945df2aa65e6d1fc26acae3739ead Mon Sep 17 00:00:00 2001 From: Luke <2609441+lc0rp@users.noreply.github.com> Date: Mon, 17 Jul 2023 20:11:30 -0400 Subject: Update BULLETIN.md and version numbers (#5002) Co-authored-by: lc0rp <2609411+lc0rp@users.noreply.github.com> --- BULLETIN.md | 29 +++++++++++++---------------- pyproject.toml | 2 +- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/BULLETIN.md b/BULLETIN.md index 117a436a8..a857a7ce1 100644 --- a/BULLETIN.md +++ b/BULLETIN.md @@ -4,26 +4,23 @@ 📖 *User Guide*: https://docs.agpt.co. 👩 *Contributors Wiki*: https://github.com/Significant-Gravitas/Auto-GPT/wiki/Contributing. -# v0.4.4 RELEASE HIGHLIGHTS! 🚀 +# v0.4.5 RELEASE HIGHLIGHTS! 🚀 # ----------------------------- -## GPT-4 is back! -Following OpenAI's recent GPT-4 GA announcement, the SMART_LLM .env setting -now defaults to GPT-4, and Auto-GPT will use GPT-4 by default in its main loop. +This release includes under-the-hood improvements and bug fixes, such as more +accurate token counts for OpenAI functions, faster CI builds, improved plugin +handling, and refactoring of the Config class for better maintainability. -### !! High Costs Warning !! 💰💀🚨 -GPT-4 costs ~20x more than GPT-3.5-turbo. -Please take note of this before using SMART_LLM. You can use `--gpt3only` -or `--gpt4only` to force the use of GPT-3.5-turbo or GPT-4, respectively, -at runtime. +We have also released some documentation updates, including: -## Re-arch v1 preview release! -We've released a preview version of the re-arch code, under `autogpt/core`. -This is a major milestone for us, and we're excited to continue working on it. -We look forward to your feedback. Follow the process here: -https://github.com/Significant-Gravitas/Auto-GPT/issues/4770. 
+- *How to share your system logs* + Visit [docs/share-your-logs.md] to learn how to share logs with us + via a log analyzer graciously contributed by https://www.e2b.dev/ -- *Auto-GPT re-architecture documentation* + You can learn more about the inner workings of the Auto-GPT re-architecture + released last cycle, via these links: + * [autogpt/core/README.md] + * [autogpt/core/ARCHITECTURE_NOTES.md] Take a look at the Release Notes on GitHub for the full changelog! https://github.com/Significant-Gravitas/Auto-GPT/releases. diff --git a/pyproject.toml b/pyproject.toml index 06b2f87f8..f16ee501f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "agpt" -version = "0.4.4" +version = "0.4.5" authors = [ { name="Torantulino", email="support@agpt.co" }, ] -- cgit v1.2.3 From 2d8fa5ca6f26f5c8b36d7d4e84187e9a0bea81dc Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Mon, 17 Jul 2023 17:15:10 -0700 Subject: Use report location (#165) --- .github/workflows/ci.yml | 8 ++++---- agbenchmark/utils.py | 4 ++++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a2224ea78..3b0dc50fe 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -8,7 +8,7 @@ on: push: branches: [master, ci-test*] paths-ignore: - - 'benchmark_runs/**' + - 'reports/**' pull_request: branches: [stable, master, release-*] @@ -167,7 +167,7 @@ jobs: REQUESTS_CA_BUNDLE: /etc/ssl/certs/ca-certificates.crt HELICONE_CACHE_ENABLED: ${{ matrix.cache-enabled }} HELICONE_PROPERTY_AGENT: ${{ matrix.agent-name }} - REPORT_LOCATION: ${{ matrix.cache-enabled == false && format('../../../benchmark_runs/{0}', matrix.agent-name) || '.'
}} + REPORT_LOCATION: ${{ format('../../reports/{0}', matrix.agent-name) }} - name: Upload reports @@ -175,7 +175,7 @@ jobs: uses: actions/upload-artifact@v3 with: name: ${{ matrix.agent-name }} - path: benchmark_runs/${{ matrix.agent-name }} + path: reports/${{ matrix.agent-name }} - name: Authenticate and Push to Branch if: (success() || failure()) && (github.event_name != 'pull_request' && matrix.cache-enabled == false) @@ -183,7 +183,7 @@ jobs: git config --global user.email "github-bot@agpt.co" git config --global user.name "Auto-GPT-Bot" - git add benchmark_runs/* || echo "nothing to commit" + git add reports/* || echo "nothing to commit" commit_message="${{ matrix.agent-name }}-$(date +'%Y%m%d%H%M%S')" git commit -m "${commit_message}" diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py index 5f1bb30da..f1ed43639 100644 --- a/agbenchmark/utils.py +++ b/agbenchmark/utils.py @@ -19,6 +19,10 @@ HOME_ENV = os.getenv("HOME_ENV") def calculate_info_test_path(reports_path: Path) -> str: + report_location = os.getenv("REPORT_LOCATION", ".") + if report_location: + reports_path = Path(os.getcwd()) / report_location + command = sys.argv if not reports_path.exists(): -- cgit v1.2.3 From ed5fd3416ac48b6b02a0497d983675c0486f70ee Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Tue, 18 Jul 2023 00:17:59 +0000 Subject: smol-developer-20230718001759 --- reports/smol-developer/file1_07-18-00-17.json | 176 ++++++++++++++++++++++++++ 1 file changed, 176 insertions(+) create mode 100644 reports/smol-developer/file1_07-18-00-17.json diff --git a/reports/smol-developer/file1_07-18-00-17.json b/reports/smol-developer/file1_07-18-00-17.json new file mode 100644 index 000000000..1842163b5 --- /dev/null +++ b/reports/smol-developer/file1_07-18-00-17.json @@ -0,0 +1,176 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-18-00:17", + "metrics": { + "run_time": "41.3 seconds", + "highest_difficulty": "novice: 3" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "5.554 seconds" + } + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d4", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": true, + "success_%": 100.0, + "run_time": "8.223 seconds" + } + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "16.099 seconds" + } + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "9.624 seconds" + } + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5", + "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "1.625 seconds" + } + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on 
agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRetrieval2": { + "data_path": "agbenchmark/challenges/retrieval/r2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d3", + "is_regression": false, + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + } + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + 
"is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRememberMultipleIdsWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3", + "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIdsWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4", + "is_regression": false, + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIdsWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + } + }, + "config": { + "workspace": "generated" + } +} \ No newline at end of file -- cgit v1.2.3 From cf14609b518c489c4c62085c6a46f993c2268595 Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Tue, 18 Jul 2023 00:22:09 +0000 Subject: Auto-GPT-20230718002209 --- reports/Auto-GPT/file1_07-18-00-18.json | 177 ++++++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 reports/Auto-GPT/file1_07-18-00-18.json diff --git a/reports/Auto-GPT/file1_07-18-00-18.json b/reports/Auto-GPT/file1_07-18-00-18.json new file mode 100644 index 000000000..aa693304d --- /dev/null +++ b/reports/Auto-GPT/file1_07-18-00-18.json @@ -0,0 +1,177 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-18-00:22", + "metrics": { + "run_time": "239.83 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "35.666 seconds" + } + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d4", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "15.512 seconds" + } + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0, 0.0]", + "success_%": 0.0, + "run_time": "126.148 seconds" + } + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "62.169 seconds" + } + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5", 
+ "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestThreeSum::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRetrieval2": { + "data_path": "agbenchmark/challenges/retrieval/r2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d3", + "is_regression": false, + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2", + "is_regression": false, + 
"metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRememberMultipleIdsWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3", + "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIdsWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4", + "is_regression": false, + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIdsWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + } + }, + "config": { + "workspace": "autogpt/workspace/auto_gpt_workspace" + } +} \ No newline at end of file -- cgit v1.2.3 From c7a5498f0f45c015e48a013cc172682b86e5b13a Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Tue, 18 Jul 2023 00:25:27 +0000 Subject: gpt-engineer-20230718002527 --- reports/gpt-engineer/file1_07-18-00-17.json | 173 ++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100644 reports/gpt-engineer/file1_07-18-00-17.json diff --git a/reports/gpt-engineer/file1_07-18-00-17.json b/reports/gpt-engineer/file1_07-18-00-17.json new file mode 100644 index 000000000..d7d3c1ee6 --- /dev/null +++ b/reports/gpt-engineer/file1_07-18-00-17.json @@ -0,0 +1,173 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-18-00:25", + "metrics": { + "run_time": "493.76 seconds", + "highest_difficulty": "intermediate: 4" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "66.807 seconds" + } + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d4", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": true, + "success_%": 100.0, + "run_time": "84.302 seconds" + } + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "74.761 seconds" + } + }, + "TestReadFile": { + 
"data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: '/home/runner/work/Auto-GPT-Benchmarks/Auto-GPT-Benchmarks/agent/gpt-engineer/projects/my-new-project/workspace/file_to_check.txt'", + "success_%": 0.0, + "run_time": "45.324 seconds" + } + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5", + "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": true, + "success_%": 100.0, + "run_time": "86.25 seconds" + } + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": true, + "success_%": 100.0, + "run_time": "76.728 seconds" + } + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1", + "is_regression": false, + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRetrieval2": { + "data_path": "agbenchmark/challenges/retrieval/r2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "assert 1 in [0.0, 0.0]", + "success_%": 0.0, + "run_time": "59.412 seconds" + } + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d3", + "is_regression": false, + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2", + "is_regression": false, + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": 
"agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRememberMultipleIdsWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3", + "is_regression": false, + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIdsWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4", + "is_regression": false, + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIdsWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + } + } + }, + "config": { + "workspace": "projects/my-new-project/workspace" + } +} \ No newline at end of file -- cgit v1.2.3 From 12c5d545837b5256f34695820601f1797b489703 Mon Sep 17 00:00:00 2001 From: Silen Naihin Date: Mon, 17 Jul 2023 22:41:58 -0400 Subject: Fixing memory challenges, naming, testing mini-agi, smooth retrieval scaling (#166) --- agbenchmark/challenge.py | 11 +-- .../challenges/code/d1/artifacts_in/__init__.py | 0 .../challenges/code/d1/artifacts_in/code.py | 13 ---- .../challenges/code/d1/artifacts_in/test.py | 31 --------- .../challenges/code/d1/artifacts_out/__init__.py | 0 .../challenges/code/d1/artifacts_out/code.py | 12 ---- .../challenges/code/d1/artifacts_out/test.py | 31 --------- agbenchmark/challenges/code/d1/data.json | 19 ------ .../code/d1_debug/artifacts_in/__init__.py | 0 .../challenges/code/d1_debug/artifacts_in/code.py | 13 ++++ .../challenges/code/d1_debug/artifacts_in/test.py | 31 +++++++++ .../code/d1_debug/artifacts_out/__init__.py | 0 .../challenges/code/d1_debug/artifacts_out/code.py | 12 ++++ .../challenges/code/d1_debug/artifacts_out/test.py | 31 +++++++++ agbenchmark/challenges/code/d1_debug/data.json | 19 ++++++ .../challenges/code/d2/artifacts_in/__init__.py | 0 .../challenges/code/d2/artifacts_in/code.py | 13 ---- .../challenges/code/d2/artifacts_in/test.py | 31 --------- .../challenges/code/d2/artifacts_out/__init__.py | 0 .../challenges/code/d2/artifacts_out/code.py | 12 ---- .../challenges/code/d2/artifacts_out/test.py | 31 --------- 
agbenchmark/challenges/code/d2/data.json | 19 ------ .../code/d2_vague/artifacts_in/__init__.py | 0 .../challenges/code/d2_vague/artifacts_in/code.py | 13 ++++ .../challenges/code/d2_vague/artifacts_in/test.py | 31 +++++++++ .../code/d2_vague/artifacts_out/__init__.py | 0 .../challenges/code/d2_vague/artifacts_out/code.py | 12 ++++ .../challenges/code/d2_vague/artifacts_out/test.py | 31 +++++++++ agbenchmark/challenges/code/d2_vague/data.json | 19 ++++++ .../challenges/code/d3/custom_python/api_tests.py | 41 ----------- agbenchmark/challenges/code/d3/data.json | 19 ------ .../code/d3_two_sum/artifacts_out/__init__.py | 0 .../code/d3_two_sum/artifacts_out/code.py | 12 ++++ .../code/d3_two_sum/custom_python/test.py | 31 +++++++++ agbenchmark/challenges/code/d3_two_sum/data.json | 19 ++++++ .../challenges/code/d4/artifacts_out/__init__.py | 0 .../challenges/code/d4/artifacts_out/code.py | 12 ---- .../challenges/code/d4/custom_python/test.py | 31 --------- agbenchmark/challenges/code/d4/data.json | 19 ------ .../code/d4_web_server/custom_python/api_tests.py | 41 +++++++++++ .../challenges/code/d4_web_server/data.json | 19 ++++++ .../challenges/code/d5/artifacts_out/__init__.py | 0 .../challenges/code/d5/artifacts_out/code.py | 23 ------- .../challenges/code/d5/custom_python/test.py | 31 --------- agbenchmark/challenges/code/d5/data.json | 19 ------ .../code/d5_three_sum/artifacts_out/__init__.py | 0 .../code/d5_three_sum/artifacts_out/code.py | 23 +++++++ .../code/d5_three_sum/custom_python/test.py | 31 +++++++++ agbenchmark/challenges/code/d5_three_sum/data.json | 19 ++++++ .../memory/m1/artifacts_in/instructions_1.txt | 2 - .../memory/m1/artifacts_in/instructions_2.txt | 1 - .../memory/m1/artifacts_in/instructions_3.txt | 1 - .../memory/m1/artifacts_in/instructions_4.txt | 1 - .../memory/m1/artifacts_in/instructions_5.txt | 1 - .../memory/m1/artifacts_out/random_file.txt | 1 - agbenchmark/challenges/memory/m1/data.json | 19 ------ .../memory/m1_id/artifacts_in/instructions_1.txt | 2 + .../memory/m1_id/artifacts_in/instructions_2.txt | 1 + .../memory/m1_id/artifacts_in/instructions_3.txt | 1 + .../memory/m1_id/artifacts_in/instructions_4.txt | 1 + .../memory/m1_id/artifacts_in/instructions_5.txt | 1 + .../memory/m1_id/artifacts_out/result.txt | 1 + agbenchmark/challenges/memory/m1_id/data.json | 19 ++++++ .../memory/m2/artifacts_in/instructions_1.txt | 1 - .../memory/m2/artifacts_in/instructions_2.txt | 1 - .../memory/m2/artifacts_in/instructions_3.txt | 1 - .../memory/m2/artifacts_in/instructions_4.txt | 1 - .../memory/m2/artifacts_in/instructions_5.txt | 1 - .../memory/m2/artifacts_out/random_file.txt | 4 -- agbenchmark/challenges/memory/m2/data.json | 19 ------ .../m2_multiple/artifacts_in/instructions_1.txt | 1 + .../m2_multiple/artifacts_in/instructions_2.txt | 1 + .../m2_multiple/artifacts_in/instructions_3.txt | 1 + .../m2_multiple/artifacts_in/instructions_4.txt | 1 + .../m2_multiple/artifacts_in/instructions_5.txt | 1 + .../memory/m2_multiple/artifacts_out/result.txt | 4 ++ .../challenges/memory/m2_multiple/data.json | 19 ++++++ .../memory/m3/artifacts_in/instructions_1.txt | 5 -- .../memory/m3/artifacts_in/instructions_2.txt | 5 -- .../memory/m3/artifacts_in/instructions_3.txt | 5 -- .../memory/m3/artifacts_in/instructions_4.txt | 5 -- .../memory/m3/artifacts_in/instructions_5.txt | 5 -- .../memory/m3/artifacts_out/random_file.txt | 4 -- agbenchmark/challenges/memory/m3/data.json | 19 ------ .../m3_noise/artifacts_in/instructions_1.txt | 5 ++ 
.../m3_noise/artifacts_in/instructions_2.txt | 5 ++ .../m3_noise/artifacts_in/instructions_3.txt | 5 ++ .../m3_noise/artifacts_in/instructions_4.txt | 5 ++ .../m3_noise/artifacts_in/instructions_5.txt | 5 ++ .../memory/m3_noise/artifacts_out/result.txt | 4 ++ agbenchmark/challenges/memory/m3_noise/data.json | 19 ++++++ .../memory/m4/artifacts_in/instructions_1.txt | 5 -- .../memory/m4/artifacts_in/instructions_2.txt | 5 -- .../memory/m4/artifacts_in/instructions_3.txt | 5 -- .../memory/m4/artifacts_in/instructions_4.txt | 5 -- .../memory/m4/artifacts_in/instructions_5.txt | 5 -- .../memory/m4/artifacts_out/random_file.txt | 4 -- agbenchmark/challenges/memory/m4/data.json | 24 ------- .../m4_phrases/artifacts_in/instructions_1.txt | 5 ++ .../m4_phrases/artifacts_in/instructions_2.txt | 5 ++ .../m4_phrases/artifacts_in/instructions_3.txt | 5 ++ .../m4_phrases/artifacts_in/instructions_4.txt | 5 ++ .../m4_phrases/artifacts_in/instructions_5.txt | 5 ++ .../memory/m4_phrases/artifacts_out/result.txt | 4 ++ agbenchmark/challenges/memory/m4_phrases/data.json | 24 +++++++ .../retrieval/r1/artifacts_out/random_file.txt | 1 - agbenchmark/challenges/retrieval/r1/data.json | 19 ------ .../r1_book_price/artifacts_out/random_file.txt | 1 + .../challenges/retrieval/r1_book_price/data.json | 19 ++++++ .../r2.1_specific/artifacts_out/random_file.txt | 1 + .../challenges/retrieval/r2.1_specific/data.json | 19 ++++++ .../r2.2_formatting/artifacts_out/random_file.txt | 1 + .../challenges/retrieval/r2.2_formatting/data.json | 19 ++++++ .../retrieval/r2/artifacts_out/random_file.txt | 1 - agbenchmark/challenges/retrieval/r2/data.json | 19 ------ .../r2_tesla_revenue/artifacts_out/random_file.txt | 1 + .../retrieval/r2_tesla_revenue/data.json | 19 ++++++ agbenchmark/challenges/retrieval/r3/data.json | 2 +- agbenchmark/conftest.py | 9 ++- agbenchmark/reports/internal_info.json | 79 +++++++++++++--------- .../reports/mini-agi/1.1_TestWriteFile.json | 57 +++++++--------- .../10.1_TestRememberMultipleWithNoise.json | 30 ++++++++ .../mini-agi/10_TestRememberMultipleWithNoise.json | 31 +++++++++ .../11.1_TestRememberMultiplePhrasesWithNoise.json | 31 +++++++++ .../11.2_TestRememberMultiplePhrasesWithNoise.json | 31 +++++++++ .../11.3_TestRememberMultiplePhrasesWithNoise.json | 31 +++++++++ .../11.4_TestRememberMultiplePhrasesWithNoise.json | 31 +++++++++ .../11.5_TestRememberMultiplePhrasesWithNoise.json | 31 +++++++++ .../11_TestRememberMultiplePhrasesWithNoise.json | 31 +++++++++ .../12.1_TestDebugSimpleTypoWithGuidance.json | 28 ++++++++ .../12.2_TestDebugSimpleTypoWithGuidance.json | 28 ++++++++ .../12.3_TestDebugSimpleTypoWithGuidance.json | 28 ++++++++ .../12_TestDebugSimpleTypoWithGuidance.json | 31 +++++++++ agbenchmark/reports/mini-agi/1_TestWriteFIle.json | 4 +- agbenchmark/reports/mini-agi/2.1_TestReadFile.json | 4 +- agbenchmark/reports/mini-agi/2_TestReadFile.json | 4 +- agbenchmark/reports/mini-agi/3.1_TestSearch.json | 27 ++++++++ agbenchmark/reports/mini-agi/3_TestSearch.json | 4 +- .../reports/mini-agi/4.1_TestBasicRetrieval.json | 27 ++++++++ .../4.1_TestDebugSimpleTypoWithGuidance.json | 28 -------- .../reports/mini-agi/4_TestBasicRetrieval.json | 27 ++++++++ .../4_TestDebugSimpleTypoWithGuidance.json | 28 -------- .../reports/mini-agi/5.1_TestRetrieval2.0.json | 30 ++++++++ .../reports/mini-agi/5_TestRetrieval2.0.json | 29 ++++++++ .../reports/mini-agi/6.1_TestRetrieval2.1.json | 30 ++++++++ .../reports/mini-agi/6.2_TestRetrieval2.1.json | 30 ++++++++ .../reports/mini-agi/6.3_TestRetrieval2.1.json 
| 30 ++++++++ .../reports/mini-agi/6.4_TestRetrieval2.1.json | 31 +++++++++ .../reports/mini-agi/6_TestRetrieval2.1.json | 30 ++++++++ .../reports/mini-agi/7.1_TestRetrieval2.2.json | 31 +++++++++ .../reports/mini-agi/7_TestRetrieval2.2.json | 30 ++++++++ .../reports/mini-agi/8.1_TestBasicMemory.json | 30 ++++++++ .../reports/mini-agi/8_TestBasicMemory.json | 31 +++++++++ .../mini-agi/9.1_TestRememberMultipleIds.json | 30 ++++++++ .../mini-agi/9_TestRememberMultipleIds.json | 31 +++++++++ agbenchmark/utils.py | 5 +- agent/mini-agi | 2 +- 157 files changed, 1576 insertions(+), 741 deletions(-) delete mode 100644 agbenchmark/challenges/code/d1/artifacts_in/__init__.py delete mode 100644 agbenchmark/challenges/code/d1/artifacts_in/code.py delete mode 100644 agbenchmark/challenges/code/d1/artifacts_in/test.py delete mode 100644 agbenchmark/challenges/code/d1/artifacts_out/__init__.py delete mode 100644 agbenchmark/challenges/code/d1/artifacts_out/code.py delete mode 100644 agbenchmark/challenges/code/d1/artifacts_out/test.py delete mode 100644 agbenchmark/challenges/code/d1/data.json create mode 100644 agbenchmark/challenges/code/d1_debug/artifacts_in/__init__.py create mode 100644 agbenchmark/challenges/code/d1_debug/artifacts_in/code.py create mode 100644 agbenchmark/challenges/code/d1_debug/artifacts_in/test.py create mode 100644 agbenchmark/challenges/code/d1_debug/artifacts_out/__init__.py create mode 100644 agbenchmark/challenges/code/d1_debug/artifacts_out/code.py create mode 100644 agbenchmark/challenges/code/d1_debug/artifacts_out/test.py create mode 100644 agbenchmark/challenges/code/d1_debug/data.json delete mode 100644 agbenchmark/challenges/code/d2/artifacts_in/__init__.py delete mode 100644 agbenchmark/challenges/code/d2/artifacts_in/code.py delete mode 100644 agbenchmark/challenges/code/d2/artifacts_in/test.py delete mode 100644 agbenchmark/challenges/code/d2/artifacts_out/__init__.py delete mode 100644 agbenchmark/challenges/code/d2/artifacts_out/code.py delete mode 100644 agbenchmark/challenges/code/d2/artifacts_out/test.py delete mode 100644 agbenchmark/challenges/code/d2/data.json create mode 100644 agbenchmark/challenges/code/d2_vague/artifacts_in/__init__.py create mode 100644 agbenchmark/challenges/code/d2_vague/artifacts_in/code.py create mode 100644 agbenchmark/challenges/code/d2_vague/artifacts_in/test.py create mode 100644 agbenchmark/challenges/code/d2_vague/artifacts_out/__init__.py create mode 100644 agbenchmark/challenges/code/d2_vague/artifacts_out/code.py create mode 100644 agbenchmark/challenges/code/d2_vague/artifacts_out/test.py create mode 100644 agbenchmark/challenges/code/d2_vague/data.json delete mode 100644 agbenchmark/challenges/code/d3/custom_python/api_tests.py delete mode 100644 agbenchmark/challenges/code/d3/data.json create mode 100644 agbenchmark/challenges/code/d3_two_sum/artifacts_out/__init__.py create mode 100644 agbenchmark/challenges/code/d3_two_sum/artifacts_out/code.py create mode 100644 agbenchmark/challenges/code/d3_two_sum/custom_python/test.py create mode 100644 agbenchmark/challenges/code/d3_two_sum/data.json delete mode 100644 agbenchmark/challenges/code/d4/artifacts_out/__init__.py delete mode 100644 agbenchmark/challenges/code/d4/artifacts_out/code.py delete mode 100644 agbenchmark/challenges/code/d4/custom_python/test.py delete mode 100644 agbenchmark/challenges/code/d4/data.json create mode 100644 agbenchmark/challenges/code/d4_web_server/custom_python/api_tests.py create mode 100644 
agbenchmark/challenges/code/d4_web_server/data.json delete mode 100644 agbenchmark/challenges/code/d5/artifacts_out/__init__.py delete mode 100644 agbenchmark/challenges/code/d5/artifacts_out/code.py delete mode 100644 agbenchmark/challenges/code/d5/custom_python/test.py delete mode 100644 agbenchmark/challenges/code/d5/data.json create mode 100644 agbenchmark/challenges/code/d5_three_sum/artifacts_out/__init__.py create mode 100644 agbenchmark/challenges/code/d5_three_sum/artifacts_out/code.py create mode 100644 agbenchmark/challenges/code/d5_three_sum/custom_python/test.py create mode 100644 agbenchmark/challenges/code/d5_three_sum/data.json delete mode 100644 agbenchmark/challenges/memory/m1/artifacts_in/instructions_1.txt delete mode 100644 agbenchmark/challenges/memory/m1/artifacts_in/instructions_2.txt delete mode 100644 agbenchmark/challenges/memory/m1/artifacts_in/instructions_3.txt delete mode 100644 agbenchmark/challenges/memory/m1/artifacts_in/instructions_4.txt delete mode 100644 agbenchmark/challenges/memory/m1/artifacts_in/instructions_5.txt delete mode 100644 agbenchmark/challenges/memory/m1/artifacts_out/random_file.txt delete mode 100644 agbenchmark/challenges/memory/m1/data.json create mode 100644 agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_1.txt create mode 100644 agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_2.txt create mode 100644 agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_3.txt create mode 100644 agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_4.txt create mode 100644 agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_5.txt create mode 100644 agbenchmark/challenges/memory/m1_id/artifacts_out/result.txt create mode 100644 agbenchmark/challenges/memory/m1_id/data.json delete mode 100644 agbenchmark/challenges/memory/m2/artifacts_in/instructions_1.txt delete mode 100644 agbenchmark/challenges/memory/m2/artifacts_in/instructions_2.txt delete mode 100644 agbenchmark/challenges/memory/m2/artifacts_in/instructions_3.txt delete mode 100644 agbenchmark/challenges/memory/m2/artifacts_in/instructions_4.txt delete mode 100644 agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt delete mode 100644 agbenchmark/challenges/memory/m2/artifacts_out/random_file.txt delete mode 100644 agbenchmark/challenges/memory/m2/data.json create mode 100644 agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_1.txt create mode 100644 agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_2.txt create mode 100644 agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_3.txt create mode 100644 agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_4.txt create mode 100644 agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_5.txt create mode 100644 agbenchmark/challenges/memory/m2_multiple/artifacts_out/result.txt create mode 100644 agbenchmark/challenges/memory/m2_multiple/data.json delete mode 100644 agbenchmark/challenges/memory/m3/artifacts_in/instructions_1.txt delete mode 100644 agbenchmark/challenges/memory/m3/artifacts_in/instructions_2.txt delete mode 100644 agbenchmark/challenges/memory/m3/artifacts_in/instructions_3.txt delete mode 100644 agbenchmark/challenges/memory/m3/artifacts_in/instructions_4.txt delete mode 100644 agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt delete mode 100644 agbenchmark/challenges/memory/m3/artifacts_out/random_file.txt delete mode 100644 agbenchmark/challenges/memory/m3/data.json create 
mode 100644 agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_1.txt create mode 100644 agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_2.txt create mode 100644 agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_3.txt create mode 100644 agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_4.txt create mode 100644 agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_5.txt create mode 100644 agbenchmark/challenges/memory/m3_noise/artifacts_out/result.txt create mode 100644 agbenchmark/challenges/memory/m3_noise/data.json delete mode 100644 agbenchmark/challenges/memory/m4/artifacts_in/instructions_1.txt delete mode 100644 agbenchmark/challenges/memory/m4/artifacts_in/instructions_2.txt delete mode 100644 agbenchmark/challenges/memory/m4/artifacts_in/instructions_3.txt delete mode 100644 agbenchmark/challenges/memory/m4/artifacts_in/instructions_4.txt delete mode 100644 agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt delete mode 100644 agbenchmark/challenges/memory/m4/artifacts_out/random_file.txt delete mode 100644 agbenchmark/challenges/memory/m4/data.json create mode 100644 agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_1.txt create mode 100644 agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_2.txt create mode 100644 agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_3.txt create mode 100644 agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_4.txt create mode 100644 agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_5.txt create mode 100644 agbenchmark/challenges/memory/m4_phrases/artifacts_out/result.txt create mode 100644 agbenchmark/challenges/memory/m4_phrases/data.json delete mode 100644 agbenchmark/challenges/retrieval/r1/artifacts_out/random_file.txt delete mode 100644 agbenchmark/challenges/retrieval/r1/data.json create mode 100644 agbenchmark/challenges/retrieval/r1_book_price/artifacts_out/random_file.txt create mode 100644 agbenchmark/challenges/retrieval/r1_book_price/data.json create mode 100644 agbenchmark/challenges/retrieval/r2.1_specific/artifacts_out/random_file.txt create mode 100644 agbenchmark/challenges/retrieval/r2.1_specific/data.json create mode 100644 agbenchmark/challenges/retrieval/r2.2_formatting/artifacts_out/random_file.txt create mode 100644 agbenchmark/challenges/retrieval/r2.2_formatting/data.json delete mode 100644 agbenchmark/challenges/retrieval/r2/artifacts_out/random_file.txt delete mode 100644 agbenchmark/challenges/retrieval/r2/data.json create mode 100644 agbenchmark/challenges/retrieval/r2_tesla_revenue/artifacts_out/random_file.txt create mode 100644 agbenchmark/challenges/retrieval/r2_tesla_revenue/data.json create mode 100644 agbenchmark/reports/mini-agi/10.1_TestRememberMultipleWithNoise.json create mode 100644 agbenchmark/reports/mini-agi/10_TestRememberMultipleWithNoise.json create mode 100644 agbenchmark/reports/mini-agi/11.1_TestRememberMultiplePhrasesWithNoise.json create mode 100644 agbenchmark/reports/mini-agi/11.2_TestRememberMultiplePhrasesWithNoise.json create mode 100644 agbenchmark/reports/mini-agi/11.3_TestRememberMultiplePhrasesWithNoise.json create mode 100644 agbenchmark/reports/mini-agi/11.4_TestRememberMultiplePhrasesWithNoise.json create mode 100644 agbenchmark/reports/mini-agi/11.5_TestRememberMultiplePhrasesWithNoise.json create mode 100644 agbenchmark/reports/mini-agi/11_TestRememberMultiplePhrasesWithNoise.json create mode 100644 
agbenchmark/reports/mini-agi/12.1_TestDebugSimpleTypoWithGuidance.json create mode 100644 agbenchmark/reports/mini-agi/12.2_TestDebugSimpleTypoWithGuidance.json create mode 100644 agbenchmark/reports/mini-agi/12.3_TestDebugSimpleTypoWithGuidance.json create mode 100644 agbenchmark/reports/mini-agi/12_TestDebugSimpleTypoWithGuidance.json create mode 100644 agbenchmark/reports/mini-agi/3.1_TestSearch.json create mode 100644 agbenchmark/reports/mini-agi/4.1_TestBasicRetrieval.json delete mode 100644 agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json create mode 100644 agbenchmark/reports/mini-agi/4_TestBasicRetrieval.json delete mode 100644 agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json create mode 100644 agbenchmark/reports/mini-agi/5.1_TestRetrieval2.0.json create mode 100644 agbenchmark/reports/mini-agi/5_TestRetrieval2.0.json create mode 100644 agbenchmark/reports/mini-agi/6.1_TestRetrieval2.1.json create mode 100644 agbenchmark/reports/mini-agi/6.2_TestRetrieval2.1.json create mode 100644 agbenchmark/reports/mini-agi/6.3_TestRetrieval2.1.json create mode 100644 agbenchmark/reports/mini-agi/6.4_TestRetrieval2.1.json create mode 100644 agbenchmark/reports/mini-agi/6_TestRetrieval2.1.json create mode 100644 agbenchmark/reports/mini-agi/7.1_TestRetrieval2.2.json create mode 100644 agbenchmark/reports/mini-agi/7_TestRetrieval2.2.json create mode 100644 agbenchmark/reports/mini-agi/8.1_TestBasicMemory.json create mode 100644 agbenchmark/reports/mini-agi/8_TestBasicMemory.json create mode 100644 agbenchmark/reports/mini-agi/9.1_TestRememberMultipleIds.json create mode 100644 agbenchmark/reports/mini-agi/9_TestRememberMultipleIds.json diff --git a/agbenchmark/challenge.py b/agbenchmark/challenge.py index 4f24bb603..cdaebed4f 100644 --- a/agbenchmark/challenge.py +++ b/agbenchmark/challenge.py @@ -103,22 +103,25 @@ class Challenge(ABC): ] def scoring(self, content: str, ground: Ground) -> float: + print("Scoring content: ", content) if ground.should_contain: for should_contain_word in ground.should_contain: if should_contain_word not in content: + print(f"Word that should exist - {should_contain_word}: False") return 0.0 else: - print( - f"Word that should exist: {should_contain_word} exists in the content" - ) + print(f"Word that should exist - {should_contain_word}: True") if ground.should_not_contain: for should_not_contain_word in ground.should_not_contain: if should_not_contain_word in content: + print( + f"Word that should not exist - {should_not_contain_word}: False" + ) return 0.0 else: print( - f"Word that should not exist: {should_not_contain_word} does not exist in the content" + f"Word that should not exist - {should_not_contain_word}: True" ) return 1.0 diff --git a/agbenchmark/challenges/code/d1/artifacts_in/__init__.py b/agbenchmark/challenges/code/d1/artifacts_in/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/challenges/code/d1/artifacts_in/code.py b/agbenchmark/challenges/code/d1/artifacts_in/code.py deleted file mode 100644 index df8120bfa..000000000 --- a/agbenchmark/challenges/code/d1/artifacts_in/code.py +++ /dev/null @@ -1,13 +0,0 @@ -# mypy: ignore-errors -from typing import List, Optional - - -def two_sum(nums: List, target: int) -> Optional[List[int]]: - seen = {} - for i, num in enumerate(nums): - typo - complement = target - num - if complement in seen: - return [seen[complement], i] - seen[num] = i - return None diff --git a/agbenchmark/challenges/code/d1/artifacts_in/test.py 
b/agbenchmark/challenges/code/d1/artifacts_in/test.py deleted file mode 100644 index d85d13537..000000000 --- a/agbenchmark/challenges/code/d1/artifacts_in/test.py +++ /dev/null @@ -1,31 +0,0 @@ -# mypy: ignore-errors -from code import two_sum -from typing import List - - -def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: - result = two_sum(nums, target) - print(result) - assert ( - result == expected_result - ), f"AssertionError: Expected the output to be {expected_result}" - - -if __name__ == "__main__": - # test the trivial case with the first two numbers - nums = [2, 7, 11, 15] - target = 9 - expected_result = [0, 1] - test_two_sum(nums, target, expected_result) - - # test for ability to use zero and the same number twice - nums = [2, 7, 0, 15, 12, 0] - target = 0 - expected_result = [2, 5] - test_two_sum(nums, target, expected_result) - - # test for first and last index usage and negative numbers - nums = [-6, 7, 11, 4] - target = -2 - expected_result = [0, 3] - test_two_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d1/artifacts_out/__init__.py b/agbenchmark/challenges/code/d1/artifacts_out/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/challenges/code/d1/artifacts_out/code.py b/agbenchmark/challenges/code/d1/artifacts_out/code.py deleted file mode 100644 index de3d8c62c..000000000 --- a/agbenchmark/challenges/code/d1/artifacts_out/code.py +++ /dev/null @@ -1,12 +0,0 @@ -# mypy: ignore-errors -from typing import List, Optional - - -def two_sum(nums: List, target: int) -> Optional[List[int]]: - seen = {} - for i, num in enumerate(nums): - complement = target - num - if complement in seen: - return [seen[complement], i] - seen[num] = i - return None diff --git a/agbenchmark/challenges/code/d1/artifacts_out/test.py b/agbenchmark/challenges/code/d1/artifacts_out/test.py deleted file mode 100644 index d85d13537..000000000 --- a/agbenchmark/challenges/code/d1/artifacts_out/test.py +++ /dev/null @@ -1,31 +0,0 @@ -# mypy: ignore-errors -from code import two_sum -from typing import List - - -def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: - result = two_sum(nums, target) - print(result) - assert ( - result == expected_result - ), f"AssertionError: Expected the output to be {expected_result}" - - -if __name__ == "__main__": - # test the trivial case with the first two numbers - nums = [2, 7, 11, 15] - target = 9 - expected_result = [0, 1] - test_two_sum(nums, target, expected_result) - - # test for ability to use zero and the same number twice - nums = [2, 7, 0, 15, 12, 0] - target = 0 - expected_result = [2, 5] - test_two_sum(nums, target, expected_result) - - # test for first and last index usage and negative numbers - nums = [-6, 7, 11, 4] - target = -2 - expected_result = [0, 3] - test_two_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d1/data.json b/agbenchmark/challenges/code/d1/data.json deleted file mode 100644 index d8e0280a4..000000000 --- a/agbenchmark/challenges/code/d1/data.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "name": "TestDebugSimpleTypoWithGuidance", - "category": ["code", "iterate"], - "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", - "dependencies": ["TestReadFile", "TestWriteFile"], - "cutoff": 60, - "ground": { - "answer": "[0, 1] [2, 5] [0, 3]", - "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], - "should_not_contain": [], - "files": 
["test.py"], - "type": "execute_python_code" - }, - "info": { - "difficulty": "basic", - "description": "Tests ability for the agent to debug python code with a simple typo in it.", - "side_effects": [] - } -} diff --git a/agbenchmark/challenges/code/d1_debug/artifacts_in/__init__.py b/agbenchmark/challenges/code/d1_debug/artifacts_in/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/challenges/code/d1_debug/artifacts_in/code.py b/agbenchmark/challenges/code/d1_debug/artifacts_in/code.py new file mode 100644 index 000000000..df8120bfa --- /dev/null +++ b/agbenchmark/challenges/code/d1_debug/artifacts_in/code.py @@ -0,0 +1,13 @@ +# mypy: ignore-errors +from typing import List, Optional + + +def two_sum(nums: List, target: int) -> Optional[List[int]]: + seen = {} + for i, num in enumerate(nums): + typo + complement = target - num + if complement in seen: + return [seen[complement], i] + seen[num] = i + return None diff --git a/agbenchmark/challenges/code/d1_debug/artifacts_in/test.py b/agbenchmark/challenges/code/d1_debug/artifacts_in/test.py new file mode 100644 index 000000000..d85d13537 --- /dev/null +++ b/agbenchmark/challenges/code/d1_debug/artifacts_in/test.py @@ -0,0 +1,31 @@ +# mypy: ignore-errors +from code import two_sum +from typing import List + + +def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: + result = two_sum(nums, target) + print(result) + assert ( + result == expected_result + ), f"AssertionError: Expected the output to be {expected_result}" + + +if __name__ == "__main__": + # test the trivial case with the first two numbers + nums = [2, 7, 11, 15] + target = 9 + expected_result = [0, 1] + test_two_sum(nums, target, expected_result) + + # test for ability to use zero and the same number twice + nums = [2, 7, 0, 15, 12, 0] + target = 0 + expected_result = [2, 5] + test_two_sum(nums, target, expected_result) + + # test for first and last index usage and negative numbers + nums = [-6, 7, 11, 4] + target = -2 + expected_result = [0, 3] + test_two_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d1_debug/artifacts_out/__init__.py b/agbenchmark/challenges/code/d1_debug/artifacts_out/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/challenges/code/d1_debug/artifacts_out/code.py b/agbenchmark/challenges/code/d1_debug/artifacts_out/code.py new file mode 100644 index 000000000..de3d8c62c --- /dev/null +++ b/agbenchmark/challenges/code/d1_debug/artifacts_out/code.py @@ -0,0 +1,12 @@ +# mypy: ignore-errors +from typing import List, Optional + + +def two_sum(nums: List, target: int) -> Optional[List[int]]: + seen = {} + for i, num in enumerate(nums): + complement = target - num + if complement in seen: + return [seen[complement], i] + seen[num] = i + return None diff --git a/agbenchmark/challenges/code/d1_debug/artifacts_out/test.py b/agbenchmark/challenges/code/d1_debug/artifacts_out/test.py new file mode 100644 index 000000000..d85d13537 --- /dev/null +++ b/agbenchmark/challenges/code/d1_debug/artifacts_out/test.py @@ -0,0 +1,31 @@ +# mypy: ignore-errors +from code import two_sum +from typing import List + + +def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: + result = two_sum(nums, target) + print(result) + assert ( + result == expected_result + ), f"AssertionError: Expected the output to be {expected_result}" + + +if __name__ == "__main__": + # test the trivial case with the first two numbers + nums = [2, 7, 11, 15] + target = 
9 + expected_result = [0, 1] + test_two_sum(nums, target, expected_result) + + # test for ability to use zero and the same number twice + nums = [2, 7, 0, 15, 12, 0] + target = 0 + expected_result = [2, 5] + test_two_sum(nums, target, expected_result) + + # test for first and last index usage and negative numbers + nums = [-6, 7, 11, 4] + target = -2 + expected_result = [0, 3] + test_two_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d1_debug/data.json b/agbenchmark/challenges/code/d1_debug/data.json new file mode 100644 index 000000000..4e2798a37 --- /dev/null +++ b/agbenchmark/challenges/code/d1_debug/data.json @@ -0,0 +1,19 @@ +{ + "name": "TestDebugSimpleTypoWithGuidance", + "category": ["code", "iterate"], + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "dependencies": ["TestReadFile", "TestWriteFile"], + "cutoff": 75, + "ground": { + "answer": "[0, 1] [2, 5] [0, 3]", + "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], + "should_not_contain": [], + "files": ["test.py"], + "type": "execute_python_code" + }, + "info": { + "difficulty": "basic", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/code/d2/artifacts_in/__init__.py b/agbenchmark/challenges/code/d2/artifacts_in/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/challenges/code/d2/artifacts_in/code.py b/agbenchmark/challenges/code/d2/artifacts_in/code.py deleted file mode 100644 index df8120bfa..000000000 --- a/agbenchmark/challenges/code/d2/artifacts_in/code.py +++ /dev/null @@ -1,13 +0,0 @@ -# mypy: ignore-errors -from typing import List, Optional - - -def two_sum(nums: List, target: int) -> Optional[List[int]]: - seen = {} - for i, num in enumerate(nums): - typo - complement = target - num - if complement in seen: - return [seen[complement], i] - seen[num] = i - return None diff --git a/agbenchmark/challenges/code/d2/artifacts_in/test.py b/agbenchmark/challenges/code/d2/artifacts_in/test.py deleted file mode 100644 index d85d13537..000000000 --- a/agbenchmark/challenges/code/d2/artifacts_in/test.py +++ /dev/null @@ -1,31 +0,0 @@ -# mypy: ignore-errors -from code import two_sum -from typing import List - - -def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: - result = two_sum(nums, target) - print(result) - assert ( - result == expected_result - ), f"AssertionError: Expected the output to be {expected_result}" - - -if __name__ == "__main__": - # test the trivial case with the first two numbers - nums = [2, 7, 11, 15] - target = 9 - expected_result = [0, 1] - test_two_sum(nums, target, expected_result) - - # test for ability to use zero and the same number twice - nums = [2, 7, 0, 15, 12, 0] - target = 0 - expected_result = [2, 5] - test_two_sum(nums, target, expected_result) - - # test for first and last index usage and negative numbers - nums = [-6, 7, 11, 4] - target = -2 - expected_result = [0, 3] - test_two_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d2/artifacts_out/__init__.py b/agbenchmark/challenges/code/d2/artifacts_out/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/challenges/code/d2/artifacts_out/code.py b/agbenchmark/challenges/code/d2/artifacts_out/code.py deleted file mode 100644 index de3d8c62c..000000000 --- 
a/agbenchmark/challenges/code/d2/artifacts_out/code.py +++ /dev/null @@ -1,12 +0,0 @@ -# mypy: ignore-errors -from typing import List, Optional - - -def two_sum(nums: List, target: int) -> Optional[List[int]]: - seen = {} - for i, num in enumerate(nums): - complement = target - num - if complement in seen: - return [seen[complement], i] - seen[num] = i - return None diff --git a/agbenchmark/challenges/code/d2/artifacts_out/test.py b/agbenchmark/challenges/code/d2/artifacts_out/test.py deleted file mode 100644 index d85d13537..000000000 --- a/agbenchmark/challenges/code/d2/artifacts_out/test.py +++ /dev/null @@ -1,31 +0,0 @@ -# mypy: ignore-errors -from code import two_sum -from typing import List - - -def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: - result = two_sum(nums, target) - print(result) - assert ( - result == expected_result - ), f"AssertionError: Expected the output to be {expected_result}" - - -if __name__ == "__main__": - # test the trivial case with the first two numbers - nums = [2, 7, 11, 15] - target = 9 - expected_result = [0, 1] - test_two_sum(nums, target, expected_result) - - # test for ability to use zero and the same number twice - nums = [2, 7, 0, 15, 12, 0] - target = 0 - expected_result = [2, 5] - test_two_sum(nums, target, expected_result) - - # test for first and last index usage and negative numbers - nums = [-6, 7, 11, 4] - target = -2 - expected_result = [0, 3] - test_two_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d2/data.json b/agbenchmark/challenges/code/d2/data.json deleted file mode 100644 index de32ef9a7..000000000 --- a/agbenchmark/challenges/code/d2/data.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "name": "TestDebugSimpleTypoWithoutGuidance", - "category": ["code", "iterate"], - "task": "Make test.py run without errors.", - "dependencies": ["TestDebugSimpleTypoWithGuidance"], - "cutoff": 60, - "ground": { - "answer": "[0, 1] [2, 5] [0, 3]", - "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], - "should_not_contain": [], - "files": ["test.py"], - "type": "execute_python_code" - }, - "info": { - "difficulty": "novice", - "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", - "side_effects": [] - } -} diff --git a/agbenchmark/challenges/code/d2_vague/artifacts_in/__init__.py b/agbenchmark/challenges/code/d2_vague/artifacts_in/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/challenges/code/d2_vague/artifacts_in/code.py b/agbenchmark/challenges/code/d2_vague/artifacts_in/code.py new file mode 100644 index 000000000..df8120bfa --- /dev/null +++ b/agbenchmark/challenges/code/d2_vague/artifacts_in/code.py @@ -0,0 +1,13 @@ +# mypy: ignore-errors +from typing import List, Optional + + +def two_sum(nums: List, target: int) -> Optional[List[int]]: + seen = {} + for i, num in enumerate(nums): + typo + complement = target - num + if complement in seen: + return [seen[complement], i] + seen[num] = i + return None diff --git a/agbenchmark/challenges/code/d2_vague/artifacts_in/test.py b/agbenchmark/challenges/code/d2_vague/artifacts_in/test.py new file mode 100644 index 000000000..d85d13537 --- /dev/null +++ b/agbenchmark/challenges/code/d2_vague/artifacts_in/test.py @@ -0,0 +1,31 @@ +# mypy: ignore-errors +from code import two_sum +from typing import List + + +def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: + result = two_sum(nums, target) + print(result) + 
assert ( + result == expected_result + ), f"AssertionError: Expected the output to be {expected_result}" + + +if __name__ == "__main__": + # test the trivial case with the first two numbers + nums = [2, 7, 11, 15] + target = 9 + expected_result = [0, 1] + test_two_sum(nums, target, expected_result) + + # test for ability to use zero and the same number twice + nums = [2, 7, 0, 15, 12, 0] + target = 0 + expected_result = [2, 5] + test_two_sum(nums, target, expected_result) + + # test for first and last index usage and negative numbers + nums = [-6, 7, 11, 4] + target = -2 + expected_result = [0, 3] + test_two_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d2_vague/artifacts_out/__init__.py b/agbenchmark/challenges/code/d2_vague/artifacts_out/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/challenges/code/d2_vague/artifacts_out/code.py b/agbenchmark/challenges/code/d2_vague/artifacts_out/code.py new file mode 100644 index 000000000..de3d8c62c --- /dev/null +++ b/agbenchmark/challenges/code/d2_vague/artifacts_out/code.py @@ -0,0 +1,12 @@ +# mypy: ignore-errors +from typing import List, Optional + + +def two_sum(nums: List, target: int) -> Optional[List[int]]: + seen = {} + for i, num in enumerate(nums): + complement = target - num + if complement in seen: + return [seen[complement], i] + seen[num] = i + return None diff --git a/agbenchmark/challenges/code/d2_vague/artifacts_out/test.py b/agbenchmark/challenges/code/d2_vague/artifacts_out/test.py new file mode 100644 index 000000000..d85d13537 --- /dev/null +++ b/agbenchmark/challenges/code/d2_vague/artifacts_out/test.py @@ -0,0 +1,31 @@ +# mypy: ignore-errors +from code import two_sum +from typing import List + + +def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: + result = two_sum(nums, target) + print(result) + assert ( + result == expected_result + ), f"AssertionError: Expected the output to be {expected_result}" + + +if __name__ == "__main__": + # test the trivial case with the first two numbers + nums = [2, 7, 11, 15] + target = 9 + expected_result = [0, 1] + test_two_sum(nums, target, expected_result) + + # test for ability to use zero and the same number twice + nums = [2, 7, 0, 15, 12, 0] + target = 0 + expected_result = [2, 5] + test_two_sum(nums, target, expected_result) + + # test for first and last index usage and negative numbers + nums = [-6, 7, 11, 4] + target = -2 + expected_result = [0, 3] + test_two_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d2_vague/data.json b/agbenchmark/challenges/code/d2_vague/data.json new file mode 100644 index 000000000..2b6c3526c --- /dev/null +++ b/agbenchmark/challenges/code/d2_vague/data.json @@ -0,0 +1,19 @@ +{ + "name": "TestDebugSimpleTypoWithoutGuidance", + "category": ["code", "iterate"], + "task": "Make test.py run without errors.", + "dependencies": ["TestDebugSimpleTypoWithGuidance"], + "cutoff": 75, + "ground": { + "answer": "[0, 1] [2, 5] [0, 3]", + "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], + "should_not_contain": [], + "files": ["test.py"], + "type": "execute_python_code" + }, + "info": { + "difficulty": "novice", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/code/d3/custom_python/api_tests.py b/agbenchmark/challenges/code/d3/custom_python/api_tests.py deleted file mode 100644 index 
f01934ef8..000000000 --- a/agbenchmark/challenges/code/d3/custom_python/api_tests.py +++ /dev/null @@ -1,41 +0,0 @@ -import os -from typing import Any, Dict -from unittest.mock import Mock, patch - -import requests - - -def test_make_request_and_assert() -> None: - result = make_request_and_assert() - print(result) - expected_result = {"status": "OK"} - error_message = f"AssertionError: Expected the output to be {expected_result}" - print(error_message) - assert result == expected_result, error_message - - -def make_assertion() -> None: - if os.environ.get("MOCK_TEST", "False").lower() == "true": - mock_response = Mock(requests.Response) - mock_response.status_code = 200 - mock_response.json.return_value = {"status": "OK"} - - with patch("requests.get", return_value=mock_response): - make_request_and_assert() - else: - make_request_and_assert() - - -def make_request_and_assert() -> Dict[str, Any]: - response = requests.get("http://localhost:8079/health") - if response.status_code != 200: - raise AssertionError( - f"Expected status code 200, but got {response.status_code}" - ) - - return response.json() - - -if __name__ == "__main__": - # test for the case when server is healthy - test_make_request_and_assert() diff --git a/agbenchmark/challenges/code/d3/data.json b/agbenchmark/challenges/code/d3/data.json deleted file mode 100644 index c5d111a4d..000000000 --- a/agbenchmark/challenges/code/d3/data.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "name": "TestCreateSimpleWebServer", - "category": ["code"], - "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", - "dependencies": ["TestDebugSimpleTypoWithGuidance"], - "cutoff": 60, - "ground": { - "answer": "GET localhost:8079/health responds with a 200 OK", - "should_contain": [], - "should_not_contain": ["AssertionError"], - "files": ["test.py"], - "type": "execute_python_code" - }, - "info": { - "difficulty": "advanced", - "description": "Tests ability for the agent to build a simple web server locally", - "side_effects": [] - } -} diff --git a/agbenchmark/challenges/code/d3_two_sum/artifacts_out/__init__.py b/agbenchmark/challenges/code/d3_two_sum/artifacts_out/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/challenges/code/d3_two_sum/artifacts_out/code.py b/agbenchmark/challenges/code/d3_two_sum/artifacts_out/code.py new file mode 100644 index 000000000..de3d8c62c --- /dev/null +++ b/agbenchmark/challenges/code/d3_two_sum/artifacts_out/code.py @@ -0,0 +1,12 @@ +# mypy: ignore-errors +from typing import List, Optional + + +def two_sum(nums: List, target: int) -> Optional[List[int]]: + seen = {} + for i, num in enumerate(nums): + complement = target - num + if complement in seen: + return [seen[complement], i] + seen[num] = i + return None diff --git a/agbenchmark/challenges/code/d3_two_sum/custom_python/test.py b/agbenchmark/challenges/code/d3_two_sum/custom_python/test.py new file mode 100644 index 000000000..d85d13537 --- /dev/null +++ b/agbenchmark/challenges/code/d3_two_sum/custom_python/test.py @@ -0,0 +1,31 @@ +# mypy: ignore-errors +from code import two_sum +from typing import List + + +def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: + result = two_sum(nums, target) + print(result) + assert ( + result == expected_result + ), f"AssertionError: Expected the output to be {expected_result}" + + +if __name__ == "__main__": + # test the trivial case with the first two numbers + nums = 
[2, 7, 11, 15] + target = 9 + expected_result = [0, 1] + test_two_sum(nums, target, expected_result) + + # test for ability to use zero and the same number twice + nums = [2, 7, 0, 15, 12, 0] + target = 0 + expected_result = [2, 5] + test_two_sum(nums, target, expected_result) + + # test for first and last index usage and negative numbers + nums = [-6, 7, 11, 4] + target = -2 + expected_result = [0, 3] + test_two_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d3_two_sum/data.json b/agbenchmark/challenges/code/d3_two_sum/data.json new file mode 100644 index 000000000..6df083d40 --- /dev/null +++ b/agbenchmark/challenges/code/d3_two_sum/data.json @@ -0,0 +1,19 @@ +{ + "name": "TestBasicCodeGeneration", + "category": ["code"], + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "dependencies": ["TestWriteFile"], + "cutoff": 90, + "ground": { + "answer": "The two_sum function coded properly.", + "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], + "should_not_contain": [], + "files": ["test.py"], + "type": "execute_python_code" + }, + "info": { + "difficulty": "advanced", + "description": "Tests ability for the agent to create the two_sum function.", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/code/d4/artifacts_out/__init__.py b/agbenchmark/challenges/code/d4/artifacts_out/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/challenges/code/d4/artifacts_out/code.py b/agbenchmark/challenges/code/d4/artifacts_out/code.py deleted file mode 100644 index de3d8c62c..000000000 --- a/agbenchmark/challenges/code/d4/artifacts_out/code.py +++ /dev/null @@ -1,12 +0,0 @@ -# mypy: ignore-errors -from typing import List, Optional - - -def two_sum(nums: List, target: int) -> Optional[List[int]]: - seen = {} - for i, num in enumerate(nums): - complement = target - num - if complement in seen: - return [seen[complement], i] - seen[num] = i - return None diff --git a/agbenchmark/challenges/code/d4/custom_python/test.py b/agbenchmark/challenges/code/d4/custom_python/test.py deleted file mode 100644 index d85d13537..000000000 --- a/agbenchmark/challenges/code/d4/custom_python/test.py +++ /dev/null @@ -1,31 +0,0 @@ -# mypy: ignore-errors -from code import two_sum -from typing import List - - -def test_two_sum(nums: List, target: int, expected_result: List[int]) -> None: - result = two_sum(nums, target) - print(result) - assert ( - result == expected_result - ), f"AssertionError: Expected the output to be {expected_result}" - - -if __name__ == "__main__": - # test the trivial case with the first two numbers - nums = [2, 7, 11, 15] - target = 9 - expected_result = [0, 1] - test_two_sum(nums, target, expected_result) - - # test for ability to use zero and the same number twice - nums = [2, 7, 0, 15, 12, 0] - target = 0 - expected_result = [2, 5] - test_two_sum(nums, target, expected_result) - - # test for first and last index usage and negative numbers - nums = [-6, 7, 11, 4] - target = -2 - expected_result = [0, 3] - test_two_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d4/data.json b/agbenchmark/challenges/code/d4/data.json deleted file mode 100644 index e8db918d2..000000000 
--- a/agbenchmark/challenges/code/d4/data.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "name": "TestBasicCodeGeneration", - "category": ["code"], - "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", - "dependencies": ["TestWriteFile"], - "cutoff": 60, - "ground": { - "answer": "The two_sum function coded properly.", - "should_contain": ["[0, 1]", "[2, 5]", "[0, 3]"], - "should_not_contain": [], - "files": ["test.py"], - "type": "execute_python_code" - }, - "info": { - "difficulty": "novice", - "description": "Tests ability for the agent to create the two_sum function.", - "side_effects": [] - } -} diff --git a/agbenchmark/challenges/code/d4_web_server/custom_python/api_tests.py b/agbenchmark/challenges/code/d4_web_server/custom_python/api_tests.py new file mode 100644 index 000000000..f01934ef8 --- /dev/null +++ b/agbenchmark/challenges/code/d4_web_server/custom_python/api_tests.py @@ -0,0 +1,41 @@ +import os +from typing import Any, Dict +from unittest.mock import Mock, patch + +import requests + + +def test_make_request_and_assert() -> None: + result = make_request_and_assert() + print(result) + expected_result = {"status": "OK"} + error_message = f"AssertionError: Expected the output to be {expected_result}" + print(error_message) + assert result == expected_result, error_message + + +def make_assertion() -> None: + if os.environ.get("MOCK_TEST", "False").lower() == "true": + mock_response = Mock(requests.Response) + mock_response.status_code = 200 + mock_response.json.return_value = {"status": "OK"} + + with patch("requests.get", return_value=mock_response): + make_request_and_assert() + else: + make_request_and_assert() + + +def make_request_and_assert() -> Dict[str, Any]: + response = requests.get("http://localhost:8079/health") + if response.status_code != 200: + raise AssertionError( + f"Expected status code 200, but got {response.status_code}" + ) + + return response.json() + + +if __name__ == "__main__": + # test for the case when server is healthy + test_make_request_and_assert() diff --git a/agbenchmark/challenges/code/d4_web_server/data.json b/agbenchmark/challenges/code/d4_web_server/data.json new file mode 100644 index 000000000..5c936e882 --- /dev/null +++ b/agbenchmark/challenges/code/d4_web_server/data.json @@ -0,0 +1,19 @@ +{ + "name": "TestCreateSimpleWebServer", + "category": ["code"], + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. 
", + "dependencies": ["TestDebugSimpleTypoWithGuidance"], + "cutoff": 90, + "ground": { + "answer": "GET localhost:8079/health responds with a 200 OK", + "should_contain": [], + "should_not_contain": ["AssertionError"], + "files": ["test.py"], + "type": "execute_python_code" + }, + "info": { + "difficulty": "advanced", + "description": "Tests ability for the agent to build a simple web server locally", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/code/d5/artifacts_out/__init__.py b/agbenchmark/challenges/code/d5/artifacts_out/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agbenchmark/challenges/code/d5/artifacts_out/code.py b/agbenchmark/challenges/code/d5/artifacts_out/code.py deleted file mode 100644 index 6056691da..000000000 --- a/agbenchmark/challenges/code/d5/artifacts_out/code.py +++ /dev/null @@ -1,23 +0,0 @@ -# mypy: ignore-errors -from typing import List, Optional - - -def three_sum(nums: List[int], target: int) -> Optional[List[int]]: - nums_indices = [(num, index) for index, num in enumerate(nums)] - nums_indices.sort() - for i in range(len(nums_indices) - 2): - if i > 0 and nums_indices[i] == nums_indices[i - 1]: - continue - l, r = i + 1, len(nums_indices) - 1 - while l < r: - three_sum = nums_indices[i][0] + nums_indices[l][0] + nums_indices[r][0] - if three_sum < target: - l += 1 - elif three_sum > target: - r -= 1 - else: - indices = sorted( - [nums_indices[i][1], nums_indices[l][1], nums_indices[r][1]] - ) - return indices - return None diff --git a/agbenchmark/challenges/code/d5/custom_python/test.py b/agbenchmark/challenges/code/d5/custom_python/test.py deleted file mode 100644 index 761b9f5c6..000000000 --- a/agbenchmark/challenges/code/d5/custom_python/test.py +++ /dev/null @@ -1,31 +0,0 @@ -# mypy: ignore-errors -from code import three_sum -from typing import List - - -def test_three_sum(nums: List[int], target: int, expected_result: List[int]) -> None: - result = three_sum(nums, target) - print(result) - assert ( - result == expected_result - ), f"AssertionError: Expected the output to be {expected_result}" - - -if __name__ == "__main__": - # test the trivial case with the first three numbers - nums = [2, 7, 11, 15] - target = 20 - expected_result = [0, 1, 2] - test_three_sum(nums, target, expected_result) - - # test for ability to use zero and the same number twice - nums = [2, 7, 0, 15, 12, 0] - target = 2 - expected_result = [0, 2, 5] - test_three_sum(nums, target, expected_result) - - # test for first and last index usage and negative numbers - nums = [-6, 7, 11, 4] - target = 9 - expected_result = [0, 2, 3] - test_three_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d5/data.json b/agbenchmark/challenges/code/d5/data.json deleted file mode 100644 index 434b1312e..000000000 --- a/agbenchmark/challenges/code/d5/data.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "name": "TestThreeSum", - "category": ["code", "iterate"], - "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", - "dependencies": ["TestWriteFile", "TestBasicCodeGeneration"], - "cutoff": 60, - "ground": { - "answer": "The three_sum function coded properly.", - "should_contain": ["[0, 1, 2]", "[0, 2, 5]", "[0, 2, 3]"], - "should_not_contain": [], - "files": ["test.py"], - "type": "execute_python_code" - }, - "info": { - "difficulty": "intermediate", - "description": "Tests ability for the agent to create the three_sum function.", - "side_effects": [] - } -} diff --git a/agbenchmark/challenges/code/d5_three_sum/artifacts_out/__init__.py b/agbenchmark/challenges/code/d5_three_sum/artifacts_out/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/agbenchmark/challenges/code/d5_three_sum/artifacts_out/code.py b/agbenchmark/challenges/code/d5_three_sum/artifacts_out/code.py new file mode 100644 index 000000000..6056691da --- /dev/null +++ b/agbenchmark/challenges/code/d5_three_sum/artifacts_out/code.py @@ -0,0 +1,23 @@ +# mypy: ignore-errors +from typing import List, Optional + + +def three_sum(nums: List[int], target: int) -> Optional[List[int]]: + nums_indices = [(num, index) for index, num in enumerate(nums)] + nums_indices.sort() + for i in range(len(nums_indices) - 2): + if i > 0 and nums_indices[i] == nums_indices[i - 1]: + continue + l, r = i + 1, len(nums_indices) - 1 + while l < r: + three_sum = nums_indices[i][0] + nums_indices[l][0] + nums_indices[r][0] + if three_sum < target: + l += 1 + elif three_sum > target: + r -= 1 + else: + indices = sorted( + [nums_indices[i][1], nums_indices[l][1], nums_indices[r][1]] + ) + return indices + return None diff --git a/agbenchmark/challenges/code/d5_three_sum/custom_python/test.py b/agbenchmark/challenges/code/d5_three_sum/custom_python/test.py new file mode 100644 index 000000000..761b9f5c6 --- /dev/null +++ b/agbenchmark/challenges/code/d5_three_sum/custom_python/test.py @@ -0,0 +1,31 @@ +# mypy: ignore-errors +from code import three_sum +from typing import List + + +def test_three_sum(nums: List[int], target: int, expected_result: List[int]) -> None: + result = three_sum(nums, target) + print(result) + assert ( + result == expected_result + ), f"AssertionError: Expected the output to be {expected_result}" + + +if __name__ == "__main__": + # test the trivial case with the first three numbers + nums = [2, 7, 11, 15] + target = 20 + expected_result = [0, 1, 2] + test_three_sum(nums, target, expected_result) + + # test for ability to use zero and the same number twice + nums = [2, 7, 0, 15, 12, 0] + target = 2 + expected_result = [0, 2, 5] + test_three_sum(nums, target, expected_result) + + # test for first and last index usage and negative numbers + nums = [-6, 7, 11, 4] + target = 9 + expected_result = [0, 2, 3] + test_three_sum(nums, target, expected_result) diff --git a/agbenchmark/challenges/code/d5_three_sum/data.json b/agbenchmark/challenges/code/d5_three_sum/data.json new file mode 100644 index 000000000..4df03b45b --- /dev/null +++ b/agbenchmark/challenges/code/d5_three_sum/data.json @@ -0,0 +1,19 @@ +{ + "name": "TestThreeSum", + "category": ["code", "iterate"], + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "dependencies": ["TestWriteFile", "TestBasicCodeGeneration"], + "cutoff": 60, + "ground": { + "answer": "The three_sum function coded properly.", + "should_contain": ["[0, 1, 2]", "[0, 2, 5]", "[0, 2, 3]"], + "should_not_contain": [], + "files": ["test.py"], + "type": "execute_python_code" + }, + "info": { + "difficulty": "advanced", + "description": "Tests ability for the agent to create the three_sum function.", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_1.txt b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_1.txt deleted file mode 100644 index da3babb1f..000000000 --- a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_1.txt +++ /dev/null @@ -1,2 +0,0 @@ -The id to remember is 2314 -Read the file instructions_2.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_2.txt b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_2.txt deleted file mode 100644 index 87c870b1a..000000000 --- a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_2.txt +++ /dev/null @@ -1 +0,0 @@ -Read the file instructions_3.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_3.txt b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_3.txt deleted file mode 100644 index 987086208..000000000 --- a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_3.txt +++ /dev/null @@ -1 +0,0 @@ -Read the file instructions_4.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_4.txt b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_4.txt deleted file mode 100644 index 78da710b8..000000000 --- a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_4.txt +++ /dev/null @@ -1 +0,0 @@ -Read the file instructions_5.txt diff --git a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m1/artifacts_in/instructions_5.txt deleted file mode 100644 index 8a2d35509..000000000 --- a/agbenchmark/challenges/memory/m1/artifacts_in/instructions_5.txt +++ /dev/null @@ -1 +0,0 @@ -Write the id previously mentioned to a .txt file. diff --git a/agbenchmark/challenges/memory/m1/artifacts_out/random_file.txt b/agbenchmark/challenges/memory/m1/artifacts_out/random_file.txt deleted file mode 100644 index 86be9d159..000000000 --- a/agbenchmark/challenges/memory/m1/artifacts_out/random_file.txt +++ /dev/null @@ -1 +0,0 @@ -2314 diff --git a/agbenchmark/challenges/memory/m1/data.json b/agbenchmark/challenges/memory/m1/data.json deleted file mode 100644 index 33c523244..000000000 --- a/agbenchmark/challenges/memory/m1/data.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "name": "TestBasicMemory", - "category": ["memory"], - "task": "Follow the instructions in the instructions_1.txt file", - "dependencies": ["TestReadFile", "TestWriteFile"], - "cutoff": 60, - "ground": { - "answer": "2314", - "should_contain": ["2314"], - "should_not_contain": [], - "files": [".txt"], - "type": "file" - }, - "info": { - "difficulty": "basic", - "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", - "side_effects": [] - } -} diff --git a/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_1.txt b/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_1.txt new file mode 100644 index 000000000..da3babb1f --- /dev/null +++ b/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_1.txt @@ -0,0 +1,2 @@ +The id to remember is 2314 +Read the file instructions_2.txt diff --git a/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_2.txt b/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_2.txt new file mode 100644 index 000000000..87c870b1a --- /dev/null +++ b/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_2.txt @@ -0,0 +1 @@ +Read the file instructions_3.txt diff --git a/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_3.txt b/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_3.txt new file mode 100644 index 000000000..987086208 --- /dev/null +++ b/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_3.txt @@ -0,0 +1 @@ +Read the file instructions_4.txt diff --git a/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_4.txt b/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_4.txt new file mode 100644 index 000000000..78da710b8 --- /dev/null +++ b/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_4.txt @@ -0,0 +1 @@ +Read the file instructions_5.txt diff --git a/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_5.txt new file mode 100644 index 000000000..73246e041 --- /dev/null +++ b/agbenchmark/challenges/memory/m1_id/artifacts_in/instructions_5.txt @@ -0,0 +1 @@ +Write the id previously mentioned to a file called result.txt. diff --git a/agbenchmark/challenges/memory/m1_id/artifacts_out/result.txt b/agbenchmark/challenges/memory/m1_id/artifacts_out/result.txt new file mode 100644 index 000000000..86be9d159 --- /dev/null +++ b/agbenchmark/challenges/memory/m1_id/artifacts_out/result.txt @@ -0,0 +1 @@ +2314 diff --git a/agbenchmark/challenges/memory/m1_id/data.json b/agbenchmark/challenges/memory/m1_id/data.json new file mode 100644 index 000000000..186d25801 --- /dev/null +++ b/agbenchmark/challenges/memory/m1_id/data.json @@ -0,0 +1,19 @@ +{ + "name": "TestBasicMemory", + "category": ["memory"], + "task": "Follow the instructions in the instructions_1.txt file", + "dependencies": ["TestReadFile", "TestWriteFile"], + "cutoff": 60, + "ground": { + "answer": "2314", + "should_contain": ["2314"], + "should_not_contain": [], + "files": ["result.txt"], + "type": "file" + }, + "info": { + "difficulty": "basic", + "description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_1.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_1.txt deleted file mode 100644 index d304ce632..000000000 --- a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_1.txt +++ /dev/null @@ -1 +0,0 @@ -The id to remember is 3145. Read the file instructions_2.txt. 
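The "ground" blocks in these data.json files (for example, TestBasicMemory just above: should_contain ["2314"], files ["result.txt"]) are consumed by the Challenge.scoring method changed in the challenge.py hunk earlier in this patch. Below is a condensed sketch of that check with the print statements omitted; the Ground type here is an assumed stand-in whose field names simply mirror the data.json keys, not the repository's actual model.

from dataclasses import dataclass, field
from typing import List


@dataclass
class Ground:
    # Assumed stand-in; field names mirror the "ground" keys in data.json.
    answer: str
    should_contain: List[str] = field(default_factory=list)
    should_not_contain: List[str] = field(default_factory=list)


def scoring(content: str, ground: Ground) -> float:
    # A single missing required string, or a single present forbidden
    # string, zeroes the score; otherwise the challenge passes outright.
    for word in ground.should_contain:
        if word not in content:
            return 0.0
    for word in ground.should_not_contain:
        if word in content:
            return 0.0
    return 1.0


# Example: the TestBasicMemory ground defined above.
ground = Ground(answer="2314", should_contain=["2314"])
assert scoring("2314\n", ground) == 1.0

Scoring is binary: partial matches earn nothing, which is why these renames also pin the output file to result.txt instead of any .txt file.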
diff --git a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_2.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_2.txt deleted file mode 100644 index cfff66ba2..000000000 --- a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_2.txt +++ /dev/null @@ -1 +0,0 @@ -The id to remember is 3791. Read the file instructions_3.txt. diff --git a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_3.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_3.txt deleted file mode 100644 index ad16b6fdc..000000000 --- a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_3.txt +++ /dev/null @@ -1 +0,0 @@ -The id to remember is 9317. Read the file instructions_4.txt. diff --git a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_4.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_4.txt deleted file mode 100644 index 2394d64bc..000000000 --- a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_4.txt +++ /dev/null @@ -1 +0,0 @@ -The id to remember is 9471. Read the file instructions_5.txt. diff --git a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt deleted file mode 100644 index 2f60afa17..000000000 --- a/agbenchmark/challenges/memory/m2/artifacts_in/instructions_5.txt +++ /dev/null @@ -1 +0,0 @@ -Write the ids previously mentioned to a .txt file. diff --git a/agbenchmark/challenges/memory/m2/artifacts_out/random_file.txt b/agbenchmark/challenges/memory/m2/artifacts_out/random_file.txt deleted file mode 100644 index 7d48aaf18..000000000 --- a/agbenchmark/challenges/memory/m2/artifacts_out/random_file.txt +++ /dev/null @@ -1,4 +0,0 @@ -3145 -3791 -9317 -9471 diff --git a/agbenchmark/challenges/memory/m2/data.json b/agbenchmark/challenges/memory/m2/data.json deleted file mode 100644 index 840e8dc83..000000000 --- a/agbenchmark/challenges/memory/m2/data.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "name": "TestRememberMultipleIds", - "category": ["memory"], - "task": "Follow the instructions in the instructions_1.txt file", - "dependencies": ["TestBasicMemory"], - "cutoff": 60, - "ground": { - "answer": "3145\n3791\n9317\n9471", - "should_contain": ["3145", "3791", "9317", "9471"], - "should_not_contain": [], - "files": [".txt"], - "type": "file" - }, - "info": { - "difficulty": "novice", - "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", - "side_effects": [] - } -} diff --git a/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_1.txt b/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_1.txt new file mode 100644 index 000000000..d304ce632 --- /dev/null +++ b/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_1.txt @@ -0,0 +1 @@ +The id to remember is 3145. Read the file instructions_2.txt. diff --git a/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_2.txt b/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_2.txt new file mode 100644 index 000000000..cfff66ba2 --- /dev/null +++ b/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_2.txt @@ -0,0 +1 @@ +The id to remember is 3791. Read the file instructions_3.txt. 
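The m2_multiple files being created here follow the instruction-chain pattern shared by these memory challenges: each instructions_N.txt states one id and points at instructions_{N+1}.txt, and the final file asks the agent to write every id it has seen to result.txt. A hypothetical generator for such a chain, illustrative only and not part of this patch:

from pathlib import Path
from typing import List


def write_instruction_chain(out_dir: Path, ids: List[int]) -> None:
    # Emit instructions_1..N with one id each, plus a final file that
    # asks for the remembered ids to be written to result.txt.
    out_dir.mkdir(parents=True, exist_ok=True)
    for i, current_id in enumerate(ids, start=1):
        (out_dir / f"instructions_{i}.txt").write_text(
            f"The id to remember is {current_id}. "
            f"Read the file instructions_{i + 1}.txt.\n"
        )
    (out_dir / f"instructions_{len(ids) + 1}.txt").write_text(
        "Write the ids previously mentioned to a file called result.txt.\n"
    )


# The four ids used by TestRememberMultipleIds in this patch.
write_instruction_chain(Path("artifacts_in"), [3145, 3791, 9317, 9471])

Run with [3145, 3791, 9317, 9471], this reproduces the five-file layout of this hunk group, with instructions_5.txt carrying only the write instruction; the noise variants that follow interleave the same chain with long distractor strings.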
diff --git a/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_3.txt b/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_3.txt new file mode 100644 index 000000000..ad16b6fdc --- /dev/null +++ b/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_3.txt @@ -0,0 +1 @@ +The id to remember is 9317. Read the file instructions_4.txt. diff --git a/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_4.txt b/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_4.txt new file mode 100644 index 000000000..2394d64bc --- /dev/null +++ b/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_4.txt @@ -0,0 +1 @@ +The id to remember is 9471. Read the file instructions_5.txt. diff --git a/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_5.txt new file mode 100644 index 000000000..30ac41026 --- /dev/null +++ b/agbenchmark/challenges/memory/m2_multiple/artifacts_in/instructions_5.txt @@ -0,0 +1 @@ +Write the ids previously mentioned to a file called result.txt. diff --git a/agbenchmark/challenges/memory/m2_multiple/artifacts_out/result.txt b/agbenchmark/challenges/memory/m2_multiple/artifacts_out/result.txt new file mode 100644 index 000000000..7d48aaf18 --- /dev/null +++ b/agbenchmark/challenges/memory/m2_multiple/artifacts_out/result.txt @@ -0,0 +1,4 @@ +3145 +3791 +9317 +9471 diff --git a/agbenchmark/challenges/memory/m2_multiple/data.json b/agbenchmark/challenges/memory/m2_multiple/data.json new file mode 100644 index 000000000..aac25b21e --- /dev/null +++ b/agbenchmark/challenges/memory/m2_multiple/data.json @@ -0,0 +1,19 @@ +{ + "name": "TestRememberMultipleIds", + "category": ["memory"], + "task": "Follow the instructions in the instructions_1.txt file", + "dependencies": ["TestBasicMemory"], + "cutoff": 60, + "ground": { + "answer": "3145\n3791\n9317\n9471", + "should_contain": ["3145", "3791", "9317", "9471"], + "should_not_contain": [], + "files": ["result.txt"], + "type": "file" + }, + "info": { + "difficulty": "novice", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_1.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_1.txt deleted file mode 100644 index 8deee1b09..000000000 --- a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_1.txt +++ /dev/null @@ -1,5 +0,0 @@ -xBd9cZTfaExYalwtUVR7m0pe3Nfaf5uBS4IxGFZPZcQjDf8Tfk2vdpqWI0ESBWCdVzsdlxMmUxq43INSz1iftsv6PTOMGQ88Cojwj5mQXp8XKZ6VJC893BDLVLLW00iQy8VerWjQay9rAJz8rYVZHa6dByYNWZNOgtjC7ejnKt0VYZtUFfRBZNWG2HNX3tgX8H2h4xeu7GIfm4wETGvzlZDANeqiY2hMCXEAsliVXXjgmvVeB05tjkS5uvq5uV2DnNyStimIVVdEMFI5Ft0qM82aMvJlUtVj6TJEmE0qPTqBXeHud72iRTcBa9okCzjYiAd6oSoJ8k9o6lmFTeq323ILYCGzsICjqoysuVonfHUDh1Ll2LTo4I2AygfPqCqvgQWq9wa8YfWKBlwPPVy2lymJRTd1mS7RUaiagoNn76ApJviCYh2fWEZcxULCKAbbn0E6vz1CBADSOEIVB14ZyyRfiDcXbgYYcnOShwMsg0vYcKDKfAHk - -The id to remember is 3145. Read the file instructions_2.txt. 
- -OueiZyOoM5eGI5VkTt6Ue1XboZ4jztAa5IGWqSbhIhLiI4X2nOmJw6tLBd3smZNwKQpq8NHxZSk76Xd82yGI3l9KhLARXRT37MRkY7WG8YQVJEurki86cIGuXthSWjq9dDKh6ILSq4DdWPIHfDeV12psErCcgEUxWtoU6bnSMnaoYteOkKWTAkXdC1t4j5p3rXbMv1j92nLmCmoslT2A9noQIODWLdudtCecTMmrbq85RLBt5WFLGMfWVsuSrSMGo5tiN7vC1siLfhlhco0q5QaMv0px6kVg44Wceg3UXIUoMxTNoh9G8uEABJhvsF2qzxkbHuhk6VRuydIWoGgfN01upk6BDfvooyAkdcTJG5jFlHOJixTe4ramT5uP54oZ0anJTB6w7hybN3o9vb4xLbAFQxCZIXZ9HXgeBnl1L8qIvQg9VsklntCMsu2cm5CgIryRBGPqnTAbrhmAmFOkNyLSVFfYmu2wtlMov2NIkYilT4Oa1Rkt diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_2.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_2.txt deleted file mode 100644 index c9b5fc5f9..000000000 --- a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_2.txt +++ /dev/null @@ -1,5 +0,0 @@ -2yXfw2vPZCZq4jGOTHF4NEUYLbAUBIcmkgLxG7qXnYLNsvvZDqAvBPw4OcOfleIWvS6S5GThSPcrSfX94yB1TT6SVHGqPkulJUk4W1wfIFRIiOSps6V8ulLyrmeZsEJ6l9B9Vrm4h6SZTQVP750TUfECOH4d5j5AtupugjqThyw3t6ZFYHr2eUYRiOiTlng2uvsoZiioBQlUitrjQ4mw8FRL3VaR2aAhHGwaNV0Q7XelFU50YQgcUYqfxHxmqCLqb7dtZ7WWwxrLcqwVbj4y1YteFzPZyU4TJKopMVizgWaam8tKa1hYAQHqEaiAAHigqvYhutPHarpzc4PP2RLE4AZCxRblSY40iXpxQ9waXsrUEZ51ZRFmvm5G17wuKghMcKea2jN2MIgvSxNBy7cszFyBTe6V6u6IMk1wVWa0YulPslLc0bXUVKqZ54b61lyBAKSoFbJVRFYB5XZBL5tp2efvTsEQ3JvFmSREEOhmawIriifCApy1 - -The id to remember is 3791. Read the file instructions_3.txt. - -BDLfeJBcfS4iqE9sNAm4ndZT2F1fsFYdXGRpRQ6xSXl014c9H7NeMbQCtFb7kRtVvzx9AItPj1uqtjA0R35N2Pj8FgxfSPDb8dlizLA6dbKY4JfCWmibzrBYoFzoxiPX57m3n8yLKHA0aejG38aMJ6XjR75kAjBW0Cw9d3Ny0MphakfW8KDZoMO3qwsPLLASYrz42K7JjThVGZvEXczRBY8la4NJPZpj91GmfsQuJezCvcI87gjfjtRDp1GECU9SmLSWBufjQWWlc4p6z5XtPPu0vqxRjoiFDFZvafU35KkEDcWAHv3KhR0Z20JD2qIrJ4CHntwGBAk61nMBpKhNx0t3ONK5X0WD7gNCdG64obji2ifsI8ZydLkROJkAJCpe4zRd04mkydCwKGJzmCGv0lu1KRn4QobFq7mEeuzD0xvvGtyiuiVXJSVqphf5ySmfjD4EvDCMRDNZx7c4pECUnLBPDlB8JwMyugfyD5mslte9YCG9kK6n diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_3.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_3.txt deleted file mode 100644 index a564a7c91..000000000 --- a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_3.txt +++ /dev/null @@ -1,5 +0,0 @@ -ccgfgLxgQxZ1F3USqH9STHNVYAV3AnxU97C77kgsSkx51Czq0cLp4ZmedPY3dVNUtm2kR9HDahe7Cp6zc3zvpK9RlRKQ1rugGDkNbgc4H57MYOQpDKzbikvHWXtxFixXqNiuRJW8h8uAAKv7qsStgJ2loyM9zKVicCdHerR2Q3kT0No2DOxHUv8T1d7rN3Kf0Sh6nz2LOSqvLjtzTAxpkM7ehUzFuOoXjQnuVUHqXTzL26ASeMd74N5DgO81LofDb6mcvrmVMIY8tfDBqHV83a8zn9HySeg5ZapnAVI1KOXRXyZEgbsVe1rEy8slCXqEuYCqn8IudJBnRDNKnZRdhwQ8tCpXQ1D3quhLEJxjhpRFiLl7lEZvtkunL2UlDkErcFjfKfVJLNy2loQsB9aKbR3NXBGbQNXR9QQ2xyGJPl8Wnw03sCRjubn83LpMoC2jirhJld5VFTffPY3jw0MyX3vUF8kfHRGowsrzzcdNuESiRf3iMcmx - -The id to remember is 9317. Read the file instructions_4.txt. 
- -8YwXVp6zAoxCfth85iCMA2BrfDW8JjjzieFnuj7GOhW8BlbTsymfiA59qTBCuThFk0wKlT4MDskG5dU7dMHuEAKslHwcfZywvUZYFacVgSn3VfQeWBUgXE7rrrYRclqpjI3nqYuqfVzNHxn0iG13j6q5SAauFSaP2x0CWdDnEGqi9Dh7DcvjVIzlXn3GLa1BTd0qceGYCJRT7UM9YTHJYHo7Ipr2Hgreawb09Xx0KovgzBSgJYpqRNwVRFTfpgvZZIEA0QDPZE28n8KgtDPHHAIroZP6KijKCjyNLjYyM0D4AtiuLDon8bcaLRhzhC6plMvI7pkpEpsct3u9zg8C8V0pONWpWNNCompsuwl4TECiVtVygrYNcroROjNs1CGfliHCvGBbADSQORrFbhRyYEhIVNgXQsh4ilpcOYLAZoxsYlFtJw5Mf7y8ebhay1uJfNaolFWD63QID5UHBqzXZUDupjtUhD0N4XEtNeVyTqXD4S5SffdQ diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_4.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_4.txt deleted file mode 100644 index 63fcd54fa..000000000 --- a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_4.txt +++ /dev/null @@ -1,5 +0,0 @@ -RpsPog8gFZhpbMDHImq5OBbzE9DwrzKaqldc7UVs8FXohjoAaLb7sYW291NVDdUUk4AYwHum0VotgPPdQp3wxGza9Se6pex9G6IDd1jxPW5UTnik0Ygns0U5m5uCvuF140ozvPBj5blJ5wWIiQOXceyl7RsXIP82w1u0yTMCmAdefJISC9HAsJKg2QBbtCNoGIHEIuCZFSdskBe9XaPxPoA40AD4UXpwKIE3qEui39QyePL1czeqSGGmmRgziDiMhFiSwNarA0rwehZPW3QKpACjRu9wcVoTg7wxT4R8bziUzwVDeI5ZeqNqNrTUYiLpfUEbckcRQD3FUALmEtzIdSMFdD9z8CiSXVEAOpUbiqm9itNBnMVaZQ2eyKxet4EvZrfUbkhGntIxf8VOsgXlkzb0iXcyHu1PogCjOkSWodlPfN4mYm0P4EvUGogI3suX3tCXp5EJiZHSIoyApxme6Nd6NkdbIjcQOep42z6z23cO5uPv41cA - -The id to remember is 9471. Read the file instructions_5.txt. - -1d2q36DxTTRcg8ijubOcPntReHzsM94G2O0xmclXuqh8kqVaYcNdl7mmQWNSSh4OEF6BUuhVg0s58UFeJnLNOzYeC8T0fYhhrvkw8enra8JS2VeQCnTxdn4GUhJXu4V0UOhaeNnArmO03M8n6mQbV77VnH5RSVR4qVWnYmHGaswdHxYOZVnxFSyzzmhdX1dK5q9EgIwadglyBkpKu15ZaMIH2SOPjil8YjD4euRZc2EGZDcvCkJAJeLsrbTQLM82k2fKqRFyuNELCQp1DRz6gExByQ11SbHRqednp6ndj8msLC116PVMqp8pSDVZ1RrTFV9UOReeO9wnveJbLUEMh8c3wlg6yXn0XrbO7qabvdKXAjIIF8JN9HdLoFGdtDPLJbHDyJPqAYt9OIfyr8knQFLJZjKCBn8QrYy1jdFIHnENf1vZTI1bA4SUOG81xeRHGDVln9mcjdlBHdWRHX9TPbaokSjl9q7nDk6734KpPullyvm10l1M diff --git a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt deleted file mode 100644 index 99c9efa35..000000000 --- a/agbenchmark/challenges/memory/m3/artifacts_in/instructions_5.txt +++ /dev/null @@ -1,5 +0,0 @@ -sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232 - -Write the ids previously mentioned to a .txt file. 
- -gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0 diff --git a/agbenchmark/challenges/memory/m3/artifacts_out/random_file.txt b/agbenchmark/challenges/memory/m3/artifacts_out/random_file.txt deleted file mode 100644 index 7d48aaf18..000000000 --- a/agbenchmark/challenges/memory/m3/artifacts_out/random_file.txt +++ /dev/null @@ -1,4 +0,0 @@ -3145 -3791 -9317 -9471 diff --git a/agbenchmark/challenges/memory/m3/data.json b/agbenchmark/challenges/memory/m3/data.json deleted file mode 100644 index 3af2fb3a5..000000000 --- a/agbenchmark/challenges/memory/m3/data.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "name": "TestRememberMultipleIdsWithNoise", - "category": ["memory"], - "task": "Follow the instructions in the instructions_1.txt file", - "dependencies": ["TestRememberMultipleIds"], - "cutoff": 60, - "ground": { - "answer": "3145\n3791\n9317\n9471", - "should_contain": ["3145", "3791", "9317", "9471"], - "should_not_contain": [], - "files": [".txt"], - "type": "file" - }, - "info": { - "difficulty": "intermediate", - "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", - "side_effects": [] - } -} diff --git a/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_1.txt b/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_1.txt new file mode 100644 index 000000000..8deee1b09 --- /dev/null +++ b/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_1.txt @@ -0,0 +1,5 @@ +xBd9cZTfaExYalwtUVR7m0pe3Nfaf5uBS4IxGFZPZcQjDf8Tfk2vdpqWI0ESBWCdVzsdlxMmUxq43INSz1iftsv6PTOMGQ88Cojwj5mQXp8XKZ6VJC893BDLVLLW00iQy8VerWjQay9rAJz8rYVZHa6dByYNWZNOgtjC7ejnKt0VYZtUFfRBZNWG2HNX3tgX8H2h4xeu7GIfm4wETGvzlZDANeqiY2hMCXEAsliVXXjgmvVeB05tjkS5uvq5uV2DnNyStimIVVdEMFI5Ft0qM82aMvJlUtVj6TJEmE0qPTqBXeHud72iRTcBa9okCzjYiAd6oSoJ8k9o6lmFTeq323ILYCGzsICjqoysuVonfHUDh1Ll2LTo4I2AygfPqCqvgQWq9wa8YfWKBlwPPVy2lymJRTd1mS7RUaiagoNn76ApJviCYh2fWEZcxULCKAbbn0E6vz1CBADSOEIVB14ZyyRfiDcXbgYYcnOShwMsg0vYcKDKfAHk + +The id to remember is 3145. Read the file instructions_2.txt. 
+ +OueiZyOoM5eGI5VkTt6Ue1XboZ4jztAa5IGWqSbhIhLiI4X2nOmJw6tLBd3smZNwKQpq8NHxZSk76Xd82yGI3l9KhLARXRT37MRkY7WG8YQVJEurki86cIGuXthSWjq9dDKh6ILSq4DdWPIHfDeV12psErCcgEUxWtoU6bnSMnaoYteOkKWTAkXdC1t4j5p3rXbMv1j92nLmCmoslT2A9noQIODWLdudtCecTMmrbq85RLBt5WFLGMfWVsuSrSMGo5tiN7vC1siLfhlhco0q5QaMv0px6kVg44Wceg3UXIUoMxTNoh9G8uEABJhvsF2qzxkbHuhk6VRuydIWoGgfN01upk6BDfvooyAkdcTJG5jFlHOJixTe4ramT5uP54oZ0anJTB6w7hybN3o9vb4xLbAFQxCZIXZ9HXgeBnl1L8qIvQg9VsklntCMsu2cm5CgIryRBGPqnTAbrhmAmFOkNyLSVFfYmu2wtlMov2NIkYilT4Oa1Rkt diff --git a/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_2.txt b/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_2.txt new file mode 100644 index 000000000..c9b5fc5f9 --- /dev/null +++ b/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_2.txt @@ -0,0 +1,5 @@ +2yXfw2vPZCZq4jGOTHF4NEUYLbAUBIcmkgLxG7qXnYLNsvvZDqAvBPw4OcOfleIWvS6S5GThSPcrSfX94yB1TT6SVHGqPkulJUk4W1wfIFRIiOSps6V8ulLyrmeZsEJ6l9B9Vrm4h6SZTQVP750TUfECOH4d5j5AtupugjqThyw3t6ZFYHr2eUYRiOiTlng2uvsoZiioBQlUitrjQ4mw8FRL3VaR2aAhHGwaNV0Q7XelFU50YQgcUYqfxHxmqCLqb7dtZ7WWwxrLcqwVbj4y1YteFzPZyU4TJKopMVizgWaam8tKa1hYAQHqEaiAAHigqvYhutPHarpzc4PP2RLE4AZCxRblSY40iXpxQ9waXsrUEZ51ZRFmvm5G17wuKghMcKea2jN2MIgvSxNBy7cszFyBTe6V6u6IMk1wVWa0YulPslLc0bXUVKqZ54b61lyBAKSoFbJVRFYB5XZBL5tp2efvTsEQ3JvFmSREEOhmawIriifCApy1 + +The id to remember is 3791. Read the file instructions_3.txt. + +BDLfeJBcfS4iqE9sNAm4ndZT2F1fsFYdXGRpRQ6xSXl014c9H7NeMbQCtFb7kRtVvzx9AItPj1uqtjA0R35N2Pj8FgxfSPDb8dlizLA6dbKY4JfCWmibzrBYoFzoxiPX57m3n8yLKHA0aejG38aMJ6XjR75kAjBW0Cw9d3Ny0MphakfW8KDZoMO3qwsPLLASYrz42K7JjThVGZvEXczRBY8la4NJPZpj91GmfsQuJezCvcI87gjfjtRDp1GECU9SmLSWBufjQWWlc4p6z5XtPPu0vqxRjoiFDFZvafU35KkEDcWAHv3KhR0Z20JD2qIrJ4CHntwGBAk61nMBpKhNx0t3ONK5X0WD7gNCdG64obji2ifsI8ZydLkROJkAJCpe4zRd04mkydCwKGJzmCGv0lu1KRn4QobFq7mEeuzD0xvvGtyiuiVXJSVqphf5ySmfjD4EvDCMRDNZx7c4pECUnLBPDlB8JwMyugfyD5mslte9YCG9kK6n diff --git a/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_3.txt b/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_3.txt new file mode 100644 index 000000000..a564a7c91 --- /dev/null +++ b/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_3.txt @@ -0,0 +1,5 @@ +ccgfgLxgQxZ1F3USqH9STHNVYAV3AnxU97C77kgsSkx51Czq0cLp4ZmedPY3dVNUtm2kR9HDahe7Cp6zc3zvpK9RlRKQ1rugGDkNbgc4H57MYOQpDKzbikvHWXtxFixXqNiuRJW8h8uAAKv7qsStgJ2loyM9zKVicCdHerR2Q3kT0No2DOxHUv8T1d7rN3Kf0Sh6nz2LOSqvLjtzTAxpkM7ehUzFuOoXjQnuVUHqXTzL26ASeMd74N5DgO81LofDb6mcvrmVMIY8tfDBqHV83a8zn9HySeg5ZapnAVI1KOXRXyZEgbsVe1rEy8slCXqEuYCqn8IudJBnRDNKnZRdhwQ8tCpXQ1D3quhLEJxjhpRFiLl7lEZvtkunL2UlDkErcFjfKfVJLNy2loQsB9aKbR3NXBGbQNXR9QQ2xyGJPl8Wnw03sCRjubn83LpMoC2jirhJld5VFTffPY3jw0MyX3vUF8kfHRGowsrzzcdNuESiRf3iMcmx + +The id to remember is 9317. Read the file instructions_4.txt. 
+ +8YwXVp6zAoxCfth85iCMA2BrfDW8JjjzieFnuj7GOhW8BlbTsymfiA59qTBCuThFk0wKlT4MDskG5dU7dMHuEAKslHwcfZywvUZYFacVgSn3VfQeWBUgXE7rrrYRclqpjI3nqYuqfVzNHxn0iG13j6q5SAauFSaP2x0CWdDnEGqi9Dh7DcvjVIzlXn3GLa1BTd0qceGYCJRT7UM9YTHJYHo7Ipr2Hgreawb09Xx0KovgzBSgJYpqRNwVRFTfpgvZZIEA0QDPZE28n8KgtDPHHAIroZP6KijKCjyNLjYyM0D4AtiuLDon8bcaLRhzhC6plMvI7pkpEpsct3u9zg8C8V0pONWpWNNCompsuwl4TECiVtVygrYNcroROjNs1CGfliHCvGBbADSQORrFbhRyYEhIVNgXQsh4ilpcOYLAZoxsYlFtJw5Mf7y8ebhay1uJfNaolFWD63QID5UHBqzXZUDupjtUhD0N4XEtNeVyTqXD4S5SffdQ diff --git a/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_4.txt b/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_4.txt new file mode 100644 index 000000000..63fcd54fa --- /dev/null +++ b/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_4.txt @@ -0,0 +1,5 @@ +RpsPog8gFZhpbMDHImq5OBbzE9DwrzKaqldc7UVs8FXohjoAaLb7sYW291NVDdUUk4AYwHum0VotgPPdQp3wxGza9Se6pex9G6IDd1jxPW5UTnik0Ygns0U5m5uCvuF140ozvPBj5blJ5wWIiQOXceyl7RsXIP82w1u0yTMCmAdefJISC9HAsJKg2QBbtCNoGIHEIuCZFSdskBe9XaPxPoA40AD4UXpwKIE3qEui39QyePL1czeqSGGmmRgziDiMhFiSwNarA0rwehZPW3QKpACjRu9wcVoTg7wxT4R8bziUzwVDeI5ZeqNqNrTUYiLpfUEbckcRQD3FUALmEtzIdSMFdD9z8CiSXVEAOpUbiqm9itNBnMVaZQ2eyKxet4EvZrfUbkhGntIxf8VOsgXlkzb0iXcyHu1PogCjOkSWodlPfN4mYm0P4EvUGogI3suX3tCXp5EJiZHSIoyApxme6Nd6NkdbIjcQOep42z6z23cO5uPv41cA + +The id to remember is 9471. Read the file instructions_5.txt. + +1d2q36DxTTRcg8ijubOcPntReHzsM94G2O0xmclXuqh8kqVaYcNdl7mmQWNSSh4OEF6BUuhVg0s58UFeJnLNOzYeC8T0fYhhrvkw8enra8JS2VeQCnTxdn4GUhJXu4V0UOhaeNnArmO03M8n6mQbV77VnH5RSVR4qVWnYmHGaswdHxYOZVnxFSyzzmhdX1dK5q9EgIwadglyBkpKu15ZaMIH2SOPjil8YjD4euRZc2EGZDcvCkJAJeLsrbTQLM82k2fKqRFyuNELCQp1DRz6gExByQ11SbHRqednp6ndj8msLC116PVMqp8pSDVZ1RrTFV9UOReeO9wnveJbLUEMh8c3wlg6yXn0XrbO7qabvdKXAjIIF8JN9HdLoFGdtDPLJbHDyJPqAYt9OIfyr8knQFLJZjKCBn8QrYy1jdFIHnENf1vZTI1bA4SUOG81xeRHGDVln9mcjdlBHdWRHX9TPbaokSjl9q7nDk6734KpPullyvm10l1M diff --git a/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_5.txt new file mode 100644 index 000000000..aa379ea51 --- /dev/null +++ b/agbenchmark/challenges/memory/m3_noise/artifacts_in/instructions_5.txt @@ -0,0 +1,5 @@ +sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232 + +Write the ids previously mentioned to a file called result.txt. 
+ +gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0 diff --git a/agbenchmark/challenges/memory/m3_noise/artifacts_out/result.txt b/agbenchmark/challenges/memory/m3_noise/artifacts_out/result.txt new file mode 100644 index 000000000..7d48aaf18 --- /dev/null +++ b/agbenchmark/challenges/memory/m3_noise/artifacts_out/result.txt @@ -0,0 +1,4 @@ +3145 +3791 +9317 +9471 diff --git a/agbenchmark/challenges/memory/m3_noise/data.json b/agbenchmark/challenges/memory/m3_noise/data.json new file mode 100644 index 000000000..8234e25c7 --- /dev/null +++ b/agbenchmark/challenges/memory/m3_noise/data.json @@ -0,0 +1,19 @@ +{ + "name": "TestRememberMultipleWithNoise", + "category": ["memory"], + "task": "Follow the instructions in the instructions_1.txt file", + "dependencies": ["TestRememberMultipleIds"], + "cutoff": 75, + "ground": { + "answer": "3145\n3791\n9317\n9471", + "should_contain": ["3145", "3791", "9317", "9471"], + "should_not_contain": [], + "files": ["result.txt"], + "type": "file" + }, + "info": { + "difficulty": "intermediate", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_1.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_1.txt deleted file mode 100644 index 1b1e0147c..000000000 --- a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_1.txt +++ /dev/null @@ -1,5 +0,0 @@ -xBd9cZTfaExYalwtUVR7m0pe3Nfaf5uBS4IxGFZPZcQjDf8Tfk2vdpqWI0ESBWCdVzsdlxMmUxq43INSz1iftsv6PTOMGQ88Cojwj5mQXp8XKZ6VJC893BDLVLLW00iQy8VerWjQay9rAJz8rYVZHa6dByYNWZNOgtjC7ejnKt0VYZtUFfRBZNWG2HNX3tgX8H2h4xeu7GIfm4wETGvzlZDANeqiY2hMCXEAsliVXXjgmvVeB05tjkS5uvq5uV2DnNyStimIVVdEMFI5Ft0qM82aMvJlUtVj6TJEmE0qPTqBXeHud72iRTcBa9okCzjYiAd6oSoJ8k9o6lmFTeq323ILYCGzsICjqoysuVonfHUDh1Ll2LTo4I2AygfPqCqvgQWq9wa8YfWKBlwPPVy2lymJRTd1mS7RUaiagoNn76ApJviCYh2fWEZcxULCKAbbn0E6vz1CBADSOEIVB14ZyyRfiDcXbgYYcnOShwMsg0vYcKDKfAHk - -The phrase to remember is 'The purple elephant danced on a rainbow while eating a taco'. Read the file instructions_2.txt. 
- -OueiZyOoM5eGI5VkTt6Ue1XboZ4jztAa5IGWqSbhIhLiI4X2nOmJw6tLBd3smZNwKQpq8NHxZSk76Xd82yGI3l9KhLARXRT37MRkY7WG8YQVJEurki86cIGuXthSWjq9dDKh6ILSq4DdWPIHfDeV12psErCcgEUxWtoU6bnSMnaoYteOkKWTAkXdC1t4j5p3rXbMv1j92nLmCmoslT2A9noQIODWLdudtCecTMmrbq85RLBt5WFLGMfWVsuSrSMGo5tiN7vC1siLfhlhco0q5QaMv0px6kVg44Wceg3UXIUoMxTNoh9G8uEABJhvsF2qzxkbHuhk6VRuydIWoGgfN01upk6BDfvooyAkdcTJG5jFlHOJixTe4ramT5uP54oZ0anJTB6w7hybN3o9vb4xLbAFQxCZIXZ9HXgeBnl1L8qIvQg9VsklntCMsu2cm5CgIryRBGPqnTAbrhmAmFOkNyLSVFfYmu2wtlMov2NIkYilT4Oa1Rkt diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_2.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_2.txt deleted file mode 100644 index 92203f5c1..000000000 --- a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_2.txt +++ /dev/null @@ -1,5 +0,0 @@ -2yXfw2vPZCZq4jGOTHF4NEUYLbAUBIcmkgLxG7qXnYLNsvvZDqAvBPw4OcOfleIWvS6S5GThSPcrSfX94yB1TT6SVHGqPkulJUk4W1wfIFRIiOSps6V8ulLyrmeZsEJ6l9B9Vrm4h6SZTQVP750TUfECOH4d5j5AtupugjqThyw3t6ZFYHr2eUYRiOiTlng2uvsoZiioBQlUitrjQ4mw8FRL3VaR2aAhHGwaNV0Q7XelFU50YQgcUYqfxHxmqCLqb7dtZ7WWwxrLcqwVbj4y1YteFzPZyU4TJKopMVizgWaam8tKa1hYAQHqEaiAAHigqvYhutPHarpzc4PP2RLE4AZCxRblSY40iXpxQ9waXsrUEZ51ZRFmvm5G17wuKghMcKea2jN2MIgvSxNBy7cszFyBTe6V6u6IMk1wVWa0YulPslLc0bXUVKqZ54b61lyBAKSoFbJVRFYB5XZBL5tp2efvTsEQ3JvFmSREEOhmawIriifCApy1 - -The phrase to remember is 'The sneaky toaster stole my socks and ran away to Hawaii'. Read the file instructions_3.txt. - -BDLfeJBcfS4iqE9sNAm4ndZT2F1fsFYdXGRpRQ6xSXl014c9H7NeMbQCtFb7kRtVvzx9AItPj1uqtjA0R35N2Pj8FgxfSPDb8dlizLA6dbKY4JfCWmibzrBYoFzoxiPX57m3n8yLKHA0aejG38aMJ6XjR75kAjBW0Cw9d3Ny0MphakfW8KDZoMO3qwsPLLASYrz42K7JjThVGZvEXczRBY8la4NJPZpj91GmfsQuJezCvcI87gjfjtRDp1GECU9SmLSWBufjQWWlc4p6z5XtPPu0vqxRjoiFDFZvafU35KkEDcWAHv3KhR0Z20JD2qIrJ4CHntwGBAk61nMBpKhNx0t3ONK5X0WD7gNCdG64obji2ifsI8ZydLkROJkAJCpe4zRd04mkydCwKGJzmCGv0lu1KRn4QobFq7mEeuzD0xvvGtyiuiVXJSVqphf5ySmfjD4EvDCMRDNZx7c4pECUnLBPDlB8JwMyugfyD5mslte9YCG9kK6n diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_3.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_3.txt deleted file mode 100644 index d7f6f08fc..000000000 --- a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_3.txt +++ /dev/null @@ -1,5 +0,0 @@ -ccgfgLxgQxZ1F3USqH9STHNVYAV3AnxU97C77kgsSkx51Czq0cLp4ZmedPY3dVNUtm2kR9HDahe7Cp6zc3zvpK9RlRKQ1rugGDkNbgc4H57MYOQpDKzbikvHWXtxFixXqNiuRJW8h8uAAKv7qsStgJ2loyM9zKVicCdHerR2Q3kT0No2DOxHUv8T1d7rN3Kf0Sh6nz2LOSqvLjtzTAxpkM7ehUzFuOoXjQnuVUHqXTzL26ASeMd74N5DgO81LofDb6mcvrmVMIY8tfDBqHV83a8zn9HySeg5ZapnAVI1KOXRXyZEgbsVe1rEy8slCXqEuYCqn8IudJBnRDNKnZRdhwQ8tCpXQ1D3quhLEJxjhpRFiLl7lEZvtkunL2UlDkErcFjfKfVJLNy2loQsB9aKbR3NXBGbQNXR9QQ2xyGJPl8Wnw03sCRjubn83LpMoC2jirhJld5VFTffPY3jw0MyX3vUF8kfHRGowsrzzcdNuESiRf3iMcmx - -The phrase to remember is 'My pet rock sings better than Beyoncé on Tuesdays'. Read the file instructions_4.txt. 
- -8YwXVp6zAoxCfth85iCMA2BrfDW8JjjzieFnuj7GOhW8BlbTsymfiA59qTBCuThFk0wKlT4MDskG5dU7dMHuEAKslHwcfZywvUZYFacVgSn3VfQeWBUgXE7rrrYRclqpjI3nqYuqfVzNHxn0iG13j6q5SAauFSaP2x0CWdDnEGqi9Dh7DcvjVIzlXn3GLa1BTd0qceGYCJRT7UM9YTHJYHo7Ipr2Hgreawb09Xx0KovgzBSgJYpqRNwVRFTfpgvZZIEA0QDPZE28n8KgtDPHHAIroZP6KijKCjyNLjYyM0D4AtiuLDon8bcaLRhzhC6plMvI7pkpEpsct3u9zg8C8V0pONWpWNNCompsuwl4TECiVtVygrYNcroROjNs1CGfliHCvGBbADSQORrFbhRyYEhIVNgXQsh4ilpcOYLAZoxsYlFtJw5Mf7y8ebhay1uJfNaolFWD63QID5UHBqzXZUDupjtUhD0N4XEtNeVyTqXD4S5SffdQ diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_4.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_4.txt deleted file mode 100644 index 6c0a13bae..000000000 --- a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_4.txt +++ /dev/null @@ -1,5 +0,0 @@ -RpsPog8gFZhpbMDHImq5OBbzE9DwrzKaqldc7UVs8FXohjoAaLb7sYW291NVDdUUk4AYwHum0VotgPPdQp3wxGza9Se6pex9G6IDd1jxPW5UTnik0Ygns0U5m5uCvuF140ozvPBj5blJ5wWIiQOXceyl7RsXIP82w1u0yTMCmAdefJISC9HAsJKg2QBbtCNoGIHEIuCZFSdskBe9XaPxPoA40AD4UXpwKIE3qEui39QyePL1czeqSGGmmRgziDiMhFiSwNarA0rwehZPW3QKpACjRu9wcVoTg7wxT4R8bziUzwVDeI5ZeqNqNrTUYiLpfUEbckcRQD3FUALmEtzIdSMFdD9z8CiSXVEAOpUbiqm9itNBnMVaZQ2eyKxet4EvZrfUbkhGntIxf8VOsgXlkzb0iXcyHu1PogCjOkSWodlPfN4mYm0P4EvUGogI3suX3tCXp5EJiZHSIoyApxme6Nd6NkdbIjcQOep42z6z23cO5uPv41cA - -The phrase to remember is 'The giant hamster rode a unicycle through the crowded mall'. Read the file instructions_5.txt. - -1d2q36DxTTRcg8ijubOcPntReHzsM94G2O0xmclXuqh8kqVaYcNdl7mmQWNSSh4OEF6BUuhVg0s58UFeJnLNOzYeC8T0fYhhrvkw8enra8JS2VeQCnTxdn4GUhJXu4V0UOhaeNnArmO03M8n6mQbV77VnH5RSVR4qVWnYmHGaswdHxYOZVnxFSyzzmhdX1dK5q9EgIwadglyBkpKu15ZaMIH2SOPjil8YjD4euRZc2EGZDcvCkJAJeLsrbTQLM82k2fKqRFyuNELCQp1DRz6gExByQ11SbHRqednp6ndj8msLC116PVMqp8pSDVZ1RrTFV9UOReeO9wnveJbLUEMh8c3wlg6yXn0XrbO7qabvdKXAjIIF8JN9HdLoFGdtDPLJbHDyJPqAYt9OIfyr8knQFLJZjKCBn8QrYy1jdFIHnENf1vZTI1bA4SUOG81xeRHGDVln9mcjdlBHdWRHX9TPbaokSjl9q7nDk6734KpPullyvm10l1M diff --git a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt deleted file mode 100644 index b4ef60bad..000000000 --- a/agbenchmark/challenges/memory/m4/artifacts_in/instructions_5.txt +++ /dev/null @@ -1,5 +0,0 @@ -sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232 - -Write the phrases previously mentioned to a .txt file. 
- -gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0 diff --git a/agbenchmark/challenges/memory/m4/artifacts_out/random_file.txt b/agbenchmark/challenges/memory/m4/artifacts_out/random_file.txt deleted file mode 100644 index 9b8405bf1..000000000 --- a/agbenchmark/challenges/memory/m4/artifacts_out/random_file.txt +++ /dev/null @@ -1,4 +0,0 @@ -The purple elephant danced on a rainbow while eating a taco -The sneaky toaster stole my socks and ran away to Hawaii -My pet rock sings better than Beyoncé on Tuesdays -The giant hamster rode a unicycle through the crowded mall diff --git a/agbenchmark/challenges/memory/m4/data.json b/agbenchmark/challenges/memory/m4/data.json deleted file mode 100644 index 17a078e1b..000000000 --- a/agbenchmark/challenges/memory/m4/data.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "name": "TestRememberMultiplePhrasesWithNoise", - "category": ["memory"], - "task": "Follow the instructions in the instructions_1.txt file", - "dependencies": ["TestRememberMultipleIdsWithNoise"], - "cutoff": 60, - "ground": { - "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyoncé on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", - "should_contain": [ - "The purple elephant danced on a rainbow while eating a taco", - "The sneaky toaster stole my socks and ran away to Hawaii", - "My pet rock sings better than Beyoncé on Tuesdays", - "The giant hamster rode a unicycle through the crowded mall" - ], - "should_not_contain": [], - "files": [".txt"], - "type": "file" - }, - "info": { - "difficulty": "advanced", - "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", - "side_effects": [] - } -} diff --git a/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_1.txt b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_1.txt new file mode 100644 index 000000000..1b1e0147c --- /dev/null +++ b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_1.txt @@ -0,0 +1,5 @@ +xBd9cZTfaExYalwtUVR7m0pe3Nfaf5uBS4IxGFZPZcQjDf8Tfk2vdpqWI0ESBWCdVzsdlxMmUxq43INSz1iftsv6PTOMGQ88Cojwj5mQXp8XKZ6VJC893BDLVLLW00iQy8VerWjQay9rAJz8rYVZHa6dByYNWZNOgtjC7ejnKt0VYZtUFfRBZNWG2HNX3tgX8H2h4xeu7GIfm4wETGvzlZDANeqiY2hMCXEAsliVXXjgmvVeB05tjkS5uvq5uV2DnNyStimIVVdEMFI5Ft0qM82aMvJlUtVj6TJEmE0qPTqBXeHud72iRTcBa9okCzjYiAd6oSoJ8k9o6lmFTeq323ILYCGzsICjqoysuVonfHUDh1Ll2LTo4I2AygfPqCqvgQWq9wa8YfWKBlwPPVy2lymJRTd1mS7RUaiagoNn76ApJviCYh2fWEZcxULCKAbbn0E6vz1CBADSOEIVB14ZyyRfiDcXbgYYcnOShwMsg0vYcKDKfAHk + +The phrase to remember is 'The purple elephant danced on a rainbow while eating a taco'. Read the file instructions_2.txt. 
+ +OueiZyOoM5eGI5VkTt6Ue1XboZ4jztAa5IGWqSbhIhLiI4X2nOmJw6tLBd3smZNwKQpq8NHxZSk76Xd82yGI3l9KhLARXRT37MRkY7WG8YQVJEurki86cIGuXthSWjq9dDKh6ILSq4DdWPIHfDeV12psErCcgEUxWtoU6bnSMnaoYteOkKWTAkXdC1t4j5p3rXbMv1j92nLmCmoslT2A9noQIODWLdudtCecTMmrbq85RLBt5WFLGMfWVsuSrSMGo5tiN7vC1siLfhlhco0q5QaMv0px6kVg44Wceg3UXIUoMxTNoh9G8uEABJhvsF2qzxkbHuhk6VRuydIWoGgfN01upk6BDfvooyAkdcTJG5jFlHOJixTe4ramT5uP54oZ0anJTB6w7hybN3o9vb4xLbAFQxCZIXZ9HXgeBnl1L8qIvQg9VsklntCMsu2cm5CgIryRBGPqnTAbrhmAmFOkNyLSVFfYmu2wtlMov2NIkYilT4Oa1Rkt diff --git a/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_2.txt b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_2.txt new file mode 100644 index 000000000..92203f5c1 --- /dev/null +++ b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_2.txt @@ -0,0 +1,5 @@ +2yXfw2vPZCZq4jGOTHF4NEUYLbAUBIcmkgLxG7qXnYLNsvvZDqAvBPw4OcOfleIWvS6S5GThSPcrSfX94yB1TT6SVHGqPkulJUk4W1wfIFRIiOSps6V8ulLyrmeZsEJ6l9B9Vrm4h6SZTQVP750TUfECOH4d5j5AtupugjqThyw3t6ZFYHr2eUYRiOiTlng2uvsoZiioBQlUitrjQ4mw8FRL3VaR2aAhHGwaNV0Q7XelFU50YQgcUYqfxHxmqCLqb7dtZ7WWwxrLcqwVbj4y1YteFzPZyU4TJKopMVizgWaam8tKa1hYAQHqEaiAAHigqvYhutPHarpzc4PP2RLE4AZCxRblSY40iXpxQ9waXsrUEZ51ZRFmvm5G17wuKghMcKea2jN2MIgvSxNBy7cszFyBTe6V6u6IMk1wVWa0YulPslLc0bXUVKqZ54b61lyBAKSoFbJVRFYB5XZBL5tp2efvTsEQ3JvFmSREEOhmawIriifCApy1 + +The phrase to remember is 'The sneaky toaster stole my socks and ran away to Hawaii'. Read the file instructions_3.txt. + +BDLfeJBcfS4iqE9sNAm4ndZT2F1fsFYdXGRpRQ6xSXl014c9H7NeMbQCtFb7kRtVvzx9AItPj1uqtjA0R35N2Pj8FgxfSPDb8dlizLA6dbKY4JfCWmibzrBYoFzoxiPX57m3n8yLKHA0aejG38aMJ6XjR75kAjBW0Cw9d3Ny0MphakfW8KDZoMO3qwsPLLASYrz42K7JjThVGZvEXczRBY8la4NJPZpj91GmfsQuJezCvcI87gjfjtRDp1GECU9SmLSWBufjQWWlc4p6z5XtPPu0vqxRjoiFDFZvafU35KkEDcWAHv3KhR0Z20JD2qIrJ4CHntwGBAk61nMBpKhNx0t3ONK5X0WD7gNCdG64obji2ifsI8ZydLkROJkAJCpe4zRd04mkydCwKGJzmCGv0lu1KRn4QobFq7mEeuzD0xvvGtyiuiVXJSVqphf5ySmfjD4EvDCMRDNZx7c4pECUnLBPDlB8JwMyugfyD5mslte9YCG9kK6n diff --git a/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_3.txt b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_3.txt new file mode 100644 index 000000000..dc444c2ab --- /dev/null +++ b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_3.txt @@ -0,0 +1,5 @@ +ccgfgLxgQxZ1F3USqH9STHNVYAV3AnxU97C77kgsSkx51Czq0cLp4ZmedPY3dVNUtm2kR9HDahe7Cp6zc3zvpK9RlRKQ1rugGDkNbgc4H57MYOQpDKzbikvHWXtxFixXqNiuRJW8h8uAAKv7qsStgJ2loyM9zKVicCdHerR2Q3kT0No2DOxHUv8T1d7rN3Kf0Sh6nz2LOSqvLjtzTAxpkM7ehUzFuOoXjQnuVUHqXTzL26ASeMd74N5DgO81LofDb6mcvrmVMIY8tfDBqHV83a8zn9HySeg5ZapnAVI1KOXRXyZEgbsVe1rEy8slCXqEuYCqn8IudJBnRDNKnZRdhwQ8tCpXQ1D3quhLEJxjhpRFiLl7lEZvtkunL2UlDkErcFjfKfVJLNy2loQsB9aKbR3NXBGbQNXR9QQ2xyGJPl8Wnw03sCRjubn83LpMoC2jirhJld5VFTffPY3jw0MyX3vUF8kfHRGowsrzzcdNuESiRf3iMcmx + +The phrase to remember is 'My pet rock sings better than Beyonce on Tuesdays'. Read the file instructions_4.txt. 
+ +8YwXVp6zAoxCfth85iCMA2BrfDW8JjjzieFnuj7GOhW8BlbTsymfiA59qTBCuThFk0wKlT4MDskG5dU7dMHuEAKslHwcfZywvUZYFacVgSn3VfQeWBUgXE7rrrYRclqpjI3nqYuqfVzNHxn0iG13j6q5SAauFSaP2x0CWdDnEGqi9Dh7DcvjVIzlXn3GLa1BTd0qceGYCJRT7UM9YTHJYHo7Ipr2Hgreawb09Xx0KovgzBSgJYpqRNwVRFTfpgvZZIEA0QDPZE28n8KgtDPHHAIroZP6KijKCjyNLjYyM0D4AtiuLDon8bcaLRhzhC6plMvI7pkpEpsct3u9zg8C8V0pONWpWNNCompsuwl4TECiVtVygrYNcroROjNs1CGfliHCvGBbADSQORrFbhRyYEhIVNgXQsh4ilpcOYLAZoxsYlFtJw5Mf7y8ebhay1uJfNaolFWD63QID5UHBqzXZUDupjtUhD0N4XEtNeVyTqXD4S5SffdQ diff --git a/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_4.txt b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_4.txt new file mode 100644 index 000000000..6c0a13bae --- /dev/null +++ b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_4.txt @@ -0,0 +1,5 @@ +RpsPog8gFZhpbMDHImq5OBbzE9DwrzKaqldc7UVs8FXohjoAaLb7sYW291NVDdUUk4AYwHum0VotgPPdQp3wxGza9Se6pex9G6IDd1jxPW5UTnik0Ygns0U5m5uCvuF140ozvPBj5blJ5wWIiQOXceyl7RsXIP82w1u0yTMCmAdefJISC9HAsJKg2QBbtCNoGIHEIuCZFSdskBe9XaPxPoA40AD4UXpwKIE3qEui39QyePL1czeqSGGmmRgziDiMhFiSwNarA0rwehZPW3QKpACjRu9wcVoTg7wxT4R8bziUzwVDeI5ZeqNqNrTUYiLpfUEbckcRQD3FUALmEtzIdSMFdD9z8CiSXVEAOpUbiqm9itNBnMVaZQ2eyKxet4EvZrfUbkhGntIxf8VOsgXlkzb0iXcyHu1PogCjOkSWodlPfN4mYm0P4EvUGogI3suX3tCXp5EJiZHSIoyApxme6Nd6NkdbIjcQOep42z6z23cO5uPv41cA + +The phrase to remember is 'The giant hamster rode a unicycle through the crowded mall'. Read the file instructions_5.txt. + +1d2q36DxTTRcg8ijubOcPntReHzsM94G2O0xmclXuqh8kqVaYcNdl7mmQWNSSh4OEF6BUuhVg0s58UFeJnLNOzYeC8T0fYhhrvkw8enra8JS2VeQCnTxdn4GUhJXu4V0UOhaeNnArmO03M8n6mQbV77VnH5RSVR4qVWnYmHGaswdHxYOZVnxFSyzzmhdX1dK5q9EgIwadglyBkpKu15ZaMIH2SOPjil8YjD4euRZc2EGZDcvCkJAJeLsrbTQLM82k2fKqRFyuNELCQp1DRz6gExByQ11SbHRqednp6ndj8msLC116PVMqp8pSDVZ1RrTFV9UOReeO9wnveJbLUEMh8c3wlg6yXn0XrbO7qabvdKXAjIIF8JN9HdLoFGdtDPLJbHDyJPqAYt9OIfyr8knQFLJZjKCBn8QrYy1jdFIHnENf1vZTI1bA4SUOG81xeRHGDVln9mcjdlBHdWRHX9TPbaokSjl9q7nDk6734KpPullyvm10l1M diff --git a/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_5.txt b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_5.txt new file mode 100644 index 000000000..2092b42a2 --- /dev/null +++ b/agbenchmark/challenges/memory/m4_phrases/artifacts_in/instructions_5.txt @@ -0,0 +1,5 @@ +sUIic8IsnuubPzkKH7Q7eZ9d6ityYWhs5x9PBYpxt4q0BqVVJ63T0Ir0YXYbhTkxrUWnPYIGlAl8Qf96LPrqqdzJEgE42XDazV3MBdZKW8V1MGUb7PThgshpo7oPwj8CzmJII76VGILp8uOtDTiS9jGXg1b8bquMQUglx7i8R58m5b9uMtUhMyd45ETxlKBwY0Mmw9hm9EtiIZYfXuKlgcE7iTYip2FZVaTTo4WJyMD0Tqpf53ffrDWzPJUABk9b3PNkRtH1HQ4CQlKkwuNA51aWL3H0rcdHggTcOUDyQzUiWxebnPjLoDbXp4SDLeJHofCSzrUoCKTj1cz82Ey7nB3zt1jbqD3MClCQWunifkt2NHDQfMQMsDyhkzG9NCSHMKbGOBgIcxqx9HEDaYuOUCYDnj2iqXiRl0DnrzRvCm9wZOX4hmkFSZicx3g8N1CFVqeSXYAMlXnZfVsr1zB30A7LbjUgaF4XEUO1AVaW8XL8snvfg232 + +Write the phrases previously mentioned to a file called result.txt. 
+ +gPBX3I9EGxmOKs2c4SfFdTiGjqSDNGtTFfLqQIFZiB8m0ooZ2gX8BMyilVHcGjOXx7TjJiknrK0aCbrXQvyTgYQkzfYEFm0MPi285Gu41T4Ue3oBc6evhvl3erooIVC85ZViFi9fEXMl5uxEuTrSbXWaauRQqxoZnA8OCqdg04HqJcd7pdVd6uZk45JBRJq8TSGlY1QnNE5IfN6bcHYoPP41c2li8TM18GBzALIaGHM2AbWGdUWMnIMQs9Dtriv0Kzrh5lywwnd8blFiUqzNNJ8BGlOQZOoj2Qp73LjDUPcTYEkJGtRTVt6gbpRHECcS6aauG03Ntjm4Fr0CCdHUdvuiMoOG0VUg6e7i3jJHA2CQ8J7PNcVIfjB9B4gOjVIHXWoOmXM9zSrIpiNc6hwaC9QfenFh1txI2lqYyDz7geulivQRIC03EiSwpjvjPqXVySFVJONM7t85Y27NJzRHyfRlEKwlAZOtk0782gyjrPOjkrLgObI0 diff --git a/agbenchmark/challenges/memory/m4_phrases/artifacts_out/result.txt b/agbenchmark/challenges/memory/m4_phrases/artifacts_out/result.txt new file mode 100644 index 000000000..bdab23d9b --- /dev/null +++ b/agbenchmark/challenges/memory/m4_phrases/artifacts_out/result.txt @@ -0,0 +1,4 @@ +The purple elephant danced on a rainbow while eating a taco +The sneaky toaster stole my socks and ran away to Hawaii +My pet rock sings better than Beyonce on Tuesdays +The giant hamster rode a unicycle through the crowded mall diff --git a/agbenchmark/challenges/memory/m4_phrases/data.json b/agbenchmark/challenges/memory/m4_phrases/data.json new file mode 100644 index 000000000..da17fb02a --- /dev/null +++ b/agbenchmark/challenges/memory/m4_phrases/data.json @@ -0,0 +1,24 @@ +{ + "name": "TestRememberMultiplePhrasesWithNoise", + "category": ["memory"], + "task": "Follow the instructions in the instructions_1.txt file", + "dependencies": ["TestRememberMultipleWithNoise"], + "cutoff": 90, + "ground": { + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyoncé on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "should_contain": [ + "The purple elephant danced on a rainbow while eating a taco", + "The sneaky toaster stole my socks and ran away to Hawaii", + "My pet rock sings better than Beyonce on Tuesdays", + "The giant hamster rode a unicycle through the crowded mall" + ], + "should_not_contain": [], + "files": ["result.txt"], + "type": "file" + }, + "info": { + "difficulty": "advanced", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/retrieval/r1/artifacts_out/random_file.txt b/agbenchmark/challenges/retrieval/r1/artifacts_out/random_file.txt deleted file mode 100644 index f558a0f94..000000000 --- a/agbenchmark/challenges/retrieval/r1/artifacts_out/random_file.txt +++ /dev/null @@ -1 +0,0 @@ -25.89 diff --git a/agbenchmark/challenges/retrieval/r1/data.json b/agbenchmark/challenges/retrieval/r1/data.json deleted file mode 100644 index c3af4862d..000000000 --- a/agbenchmark/challenges/retrieval/r1/data.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "name": "TestBasicRetrieval", - "category": ["retrieval"], - "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", - "dependencies": ["TestWriteFile", "TestSearch"], - "cutoff": 60, - "ground": { - "answer": "£25.89", - "should_contain": ["25.89"], - "should_not_contain": [], - "files": [".txt"], - "type": "file" - }, - "info": { - "difficulty": "basic", - "description": "Tests ability to retrieve information from a website.", - "side_effects": [] - } -} diff --git a/agbenchmark/challenges/retrieval/r1_book_price/artifacts_out/random_file.txt b/agbenchmark/challenges/retrieval/r1_book_price/artifacts_out/random_file.txt new file mode 100644 index 000000000..f558a0f94 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r1_book_price/artifacts_out/random_file.txt @@ -0,0 +1 @@ +25.89 diff --git a/agbenchmark/challenges/retrieval/r1_book_price/data.json b/agbenchmark/challenges/retrieval/r1_book_price/data.json new file mode 100644 index 000000000..9a6924daf --- /dev/null +++ b/agbenchmark/challenges/retrieval/r1_book_price/data.json @@ -0,0 +1,19 @@ +{ + "name": "TestBasicRetrieval", + "category": ["retrieval"], + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "dependencies": ["TestWriteFile", "TestSearch"], + "cutoff": 60, + "ground": { + "answer": "£25.89", + "should_contain": ["25.89"], + "should_not_contain": [], + "files": [".txt"], + "type": "file" + }, + "info": { + "difficulty": "basic", + "description": "Tests ability to retrieve information from a website that is explicitly specified in the task.", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/retrieval/r2.1_specific/artifacts_out/random_file.txt b/agbenchmark/challenges/retrieval/r2.1_specific/artifacts_out/random_file.txt new file mode 100644 index 000000000..8a0eae046 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r2.1_specific/artifacts_out/random_file.txt @@ -0,0 +1 @@ +81,462 Millions diff --git a/agbenchmark/challenges/retrieval/r2.1_specific/data.json b/agbenchmark/challenges/retrieval/r2.1_specific/data.json new file mode 100644 index 000000000..3fcd50801 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r2.1_specific/data.json @@ -0,0 +1,19 @@ +{ + "name": "TestRetrieval2.1", + "category": ["retrieval"], + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "dependencies": ["TestRetrieval2.0"], + "cutoff": 60, + "ground": { + "answer": "It was $81.462 billion in 2022.", + "should_contain": ["81", "462"], + "should_not_contain": [], + "files": [".txt"], + "type": "file" + }, + "info": { + "difficulty": "novice", + "description": "This one checks the accuracy of the information over r2", + "side_effects": [] + } +} diff --git 
a/agbenchmark/challenges/retrieval/r2.2_formatting/artifacts_out/random_file.txt b/agbenchmark/challenges/retrieval/r2.2_formatting/artifacts_out/random_file.txt new file mode 100644 index 000000000..8a0eae046 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r2.2_formatting/artifacts_out/random_file.txt @@ -0,0 +1 @@ +81,462 Millions diff --git a/agbenchmark/challenges/retrieval/r2.2_formatting/data.json b/agbenchmark/challenges/retrieval/r2.2_formatting/data.json new file mode 100644 index 000000000..294effeff --- /dev/null +++ b/agbenchmark/challenges/retrieval/r2.2_formatting/data.json @@ -0,0 +1,19 @@ +{ + "name": "TestRetrieval2.2", + "category": ["retrieval"], + "task": "Write Tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 million).", + "dependencies": ["TestRetrieval2.1"], + "cutoff": 60, + "ground": { + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "should_contain": ["81,462"], + "should_not_contain": [], + "files": [".txt"], + "type": "file" + }, + "info": { + "difficulty": "intermediate", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/retrieval/r2/artifacts_out/random_file.txt b/agbenchmark/challenges/retrieval/r2/artifacts_out/random_file.txt deleted file mode 100644 index 8a0eae046..000000000 --- a/agbenchmark/challenges/retrieval/r2/artifacts_out/random_file.txt +++ /dev/null @@ -1 +0,0 @@ -81,462 Millions diff --git a/agbenchmark/challenges/retrieval/r2/data.json b/agbenchmark/challenges/retrieval/r2/data.json deleted file mode 100644 index f558b8584..000000000 --- a/agbenchmark/challenges/retrieval/r2/data.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "name": "TestRetrieval2", - "category": ["retrieval"], - "task": "Write tesla's revenue in 2022 into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", - "dependencies": ["TestBasicRetrieval"], - "cutoff": 60, - "ground": { - "answer": "81,462", - "should_contain": ["81,462"], - "should_not_contain": [], - "files": [".txt"], - "type": "file" - }, - "info": { - "difficulty": "novice", - "description": "Tests ability to retrieve information.", - "side_effects": ["tests if there is in fact an LLM attached"] - } -} diff --git a/agbenchmark/challenges/retrieval/r2_tesla_revenue/artifacts_out/random_file.txt b/agbenchmark/challenges/retrieval/r2_tesla_revenue/artifacts_out/random_file.txt new file mode 100644 index 000000000..8a0eae046 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r2_tesla_revenue/artifacts_out/random_file.txt @@ -0,0 +1 @@ +81,462 Millions diff --git a/agbenchmark/challenges/retrieval/r2_tesla_revenue/data.json b/agbenchmark/challenges/retrieval/r2_tesla_revenue/data.json new file mode 100644 index 000000000..63665ed63 --- /dev/null +++ b/agbenchmark/challenges/retrieval/r2_tesla_revenue/data.json @@ -0,0 +1,19 @@ +{ + "name": "TestRetrieval2.0", + "category": ["retrieval"], + "task": "Write Tesla's revenue in 2022 into a .txt file.", + "dependencies": ["TestBasicRetrieval"], + "cutoff": 60, + "ground": { + "answer": "It was $81.462 billion in 2022.", + "should_contain": ["81"], + "should_not_contain": [], + "files": [".txt"], + "type": "file" + }, + "info": { + "difficulty": "novice", + "description": "A no guardrails search for info", + "side_effects": [] + } +} diff --git a/agbenchmark/challenges/retrieval/r3/data.json b/agbenchmark/challenges/retrieval/r3/data.json index eb998ffbf..d3e4bd0f5 100644 --- a/agbenchmark/challenges/retrieval/r3/data.json +++ b/agbenchmark/challenges/retrieval/r3/data.json @@ -2,7 +2,7 @@ "name": "TestRetrieval3", "category": ["retrieval"], "task": "Write Tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", - "dependencies": ["TestRetrieval2"], + "dependencies": ["TestRetrieval2.1"], "cutoff": 60, "ground": { "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py index 4a62af0b5..b544d2c6e 100644 --- a/agbenchmark/conftest.py +++ b/agbenchmark/conftest.py @@ -135,8 +135,8 @@ internal_info = ReportManager(str(INTERNAL_LOGS_PATH / "internal_info.json")) def pytest_runtest_makereport(item: Any, call: Any) -> None: + challenge_data = item.funcargs.get("challenge_data", None) if call.when == "call": - challenge_data = item.funcargs.get("challenge_data", None) difficulty = ( challenge_data["info"]["difficulty"] if challenge_data else "unknown" ) @@ -157,6 +157,9 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None: info_details: Any = { "data_path": challenge_location, "is_regression": False, + "task": challenge_data["task"], + "answer": challenge_data["ground"]["answer"], + "description": challenge_data["info"]["description"], "metrics": { "difficulty": difficulty, "success": False, @@ -218,6 +221,10 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None: "run_time" ] = f"{str(round(run_time, 3))} seconds" + info_details["reached_cutoff"] = ( + float(run_time) > challenge_data["cutoff"] + ) + info_manager.add_test(test_name, info_details) diff --git a/agbenchmark/reports/internal_info.json b/agbenchmark/reports/internal_info.json index 0bfad744a..d20e8c778 100644 --- a/agbenchmark/reports/internal_info.json +++ b/agbenchmark/reports/internal_info.json @@ -3,70 +3,83 @@ "TestBasicMemory": [ true, true, - true - ], - "TestBasicRetrieval": [ true, true, - true - ], - "TestCreateSimpleWebServer": [ - false, - false, - false - ], - "TestDebugSimpleTypoWithGuidance": [ - false, - false, - false - ], - "TestDebugSimpleTypoWithoutGuidance": [ + true, false, false, - false + true ], - "TestReadFile": [ + "TestBasicRetrieval": [ true, true, true, true, true ], - "TestRememberMultipleIds": [ + "TestReadFile": [ true, true, - true - ], - "TestRememberMultipleIdsWithNoise": [ true, true, true ], - "TestRememberMultiplePhrasesWithNoise": [ + "TestSearch": [ true, true, - true - ], - "TestRetrieval2": [ true, true, true ], - "TestRetrieval3": [ + "TestWriteFile": [ + true, true, true, true ], - "TestSearch": [ - true, - true, + "TestRetrieval2.2": [ + false, + false, + false, + false + ], + "TestRetrieval2.1": [ + false, + false, + false, + false, + false, + false + ], + "TestRetrieval2.0": [ true, + false + ], + "TestRememberMultipleIds": [ + false, + false, true ], - "TestWriteFile": [ - true, - true, + "TestRememberMultipleIdsWithNoise": [ + false + ], + "TestRememberMultipleWithNoise": [ + false, true + ], + "TestRememberMultiplePhrasesWithNoise": [ + false, + false, + false, + false, + false, + false + ], + "TestDebugSimpleTypoWithGuidance": [ + false, + false, + false, + false ] } } \ No newline at end of file diff --git a/agbenchmark/reports/mini-agi/1.1_TestWriteFile.json b/agbenchmark/reports/mini-agi/1.1_TestWriteFile.json index 637c2d5c5..419052311 100644 --- a/agbenchmark/reports/mini-agi/1.1_TestWriteFile.json +++ b/agbenchmark/reports/mini-agi/1.1_TestWriteFile.json @@ -1,36 +1,27 @@ { + "command": "agbenchmark start 
--test TestWriteFile", + "completion_time": "2023-07-17-13:34", + "metrics": { + "run_time": "23.83 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { "TestWriteFile": { - "data_path": "agbenchmark/challenges/interface/write_file", - "is_regression": true, - "metrics": { - "difficulty": "interface", - "success": true, - "non_mock_success_%": 100.0, - "run_time": "0.009 seconds" - } - }, - "additional": { - "model": "gpt-3.5-turbo" - }, - "command": "agbenchmark start --test TestWriteFile", - "completion_time": "2023-07-17-09:54", - "config": { - "workspace": "${os.path.join(Path.home(), 'miniagi')}" - }, - "metrics": { - "run_time": "22.36 seconds", - "highest_difficulty": "interface: 1" - }, - "tests": { - "TestWriteFile": { - "data_path": "agbenchmark/challenges/interface/write_file", - "is_regression": false, - "metrics": { - "difficulty": "interface", - "success": true, - "success_%": 40.0, - "run_time": "22.169 seconds" - } - } + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": true, + "reached_cutoff": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "23.627 seconds" + } } -} \ No newline at end of file + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-3.5-turbo" + } +} diff --git a/agbenchmark/reports/mini-agi/10.1_TestRememberMultipleWithNoise.json b/agbenchmark/reports/mini-agi/10.1_TestRememberMultipleWithNoise.json new file mode 100644 index 000000000..811fd3e85 --- /dev/null +++ b/agbenchmark/reports/mini-agi/10.1_TestRememberMultipleWithNoise.json @@ -0,0 +1,30 @@ +{ + "command": "agbenchmark start --test TestRememberMultipleWithNoise", + "completion_time": "2023-07-17-21:24", + "metrics": { + "run_time": "77.71 seconds", + "highest_difficulty": "intermediate: 4" + }, + "tests": { + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": true, + "success_%": 50.0, + "run_time": "77.397 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/10_TestRememberMultipleWithNoise.json b/agbenchmark/reports/mini-agi/10_TestRememberMultipleWithNoise.json new file mode 100644 index 000000000..08c2b7075 --- /dev/null +++ b/agbenchmark/reports/mini-agi/10_TestRememberMultipleWithNoise.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestRememberMultipleWithNoise", + "completion_time": "2023-07-17-21:19", + "metrics": { + "run_time": "74.3 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'", + "success_%": 0.0, + "run_time": "74.059 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/11.1_TestRememberMultiplePhrasesWithNoise.json b/agbenchmark/reports/mini-agi/11.1_TestRememberMultiplePhrasesWithNoise.json new file mode 100644 index 000000000..0de6f003c --- /dev/null +++ b/agbenchmark/reports/mini-agi/11.1_TestRememberMultiplePhrasesWithNoise.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise", + "completion_time": "2023-07-17-21:28", + "metrics": { + "run_time": "60.86 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'", + "success_%": 0.0, + "run_time": "60.631 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/11.2_TestRememberMultiplePhrasesWithNoise.json b/agbenchmark/reports/mini-agi/11.2_TestRememberMultiplePhrasesWithNoise.json new file mode 100644 index 000000000..1d2abb8e7 --- /dev/null +++ b/agbenchmark/reports/mini-agi/11.2_TestRememberMultiplePhrasesWithNoise.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise", + "completion_time": "2023-07-17-21:32", + "metrics": { + "run_time": "73.04 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'", + "success_%": 0.0, + "run_time": "72.736 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/11.3_TestRememberMultiplePhrasesWithNoise.json b/agbenchmark/reports/mini-agi/11.3_TestRememberMultiplePhrasesWithNoise.json new file mode 100644 index 000000000..1d256b8c0 --- /dev/null +++ b/agbenchmark/reports/mini-agi/11.3_TestRememberMultiplePhrasesWithNoise.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise", + "completion_time": "2023-07-17-21:34", + "metrics": { + "run_time": "81.59 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'", + "success_%": 0.0, + "run_time": "81.374 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/11.4_TestRememberMultiplePhrasesWithNoise.json b/agbenchmark/reports/mini-agi/11.4_TestRememberMultiplePhrasesWithNoise.json new file mode 100644 index 000000000..e67a6ac3e --- /dev/null +++ b/agbenchmark/reports/mini-agi/11.4_TestRememberMultiplePhrasesWithNoise.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise", + "completion_time": "2023-07-17-21:36", + "metrics": { + "run_time": "98.32 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "98.021 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/11.5_TestRememberMultiplePhrasesWithNoise.json b/agbenchmark/reports/mini-agi/11.5_TestRememberMultiplePhrasesWithNoise.json new file mode 100644 index 000000000..9e76704db --- /dev/null +++ b/agbenchmark/reports/mini-agi/11.5_TestRememberMultiplePhrasesWithNoise.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise", + "completion_time": "2023-07-17-21:42", + "metrics": { + "run_time": "303.13 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'", + "success_%": 0.0, + "run_time": "302.919 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/11_TestRememberMultiplePhrasesWithNoise.json b/agbenchmark/reports/mini-agi/11_TestRememberMultiplePhrasesWithNoise.json new file mode 100644 index 000000000..e98ca330e --- /dev/null +++ b/agbenchmark/reports/mini-agi/11_TestRememberMultiplePhrasesWithNoise.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestRememberMultiplePhrasesWithNoise", + "completion_time": "2023-07-17-21:27", + "metrics": { + "run_time": "77.72 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'", + "success_%": 0.0, + "run_time": "77.491 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/12.1_TestDebugSimpleTypoWithGuidance.json b/agbenchmark/reports/mini-agi/12.1_TestDebugSimpleTypoWithGuidance.json new file mode 100644 index 000000000..9c9f3dc2a --- /dev/null +++ b/agbenchmark/reports/mini-agi/12.1_TestDebugSimpleTypoWithGuidance.json @@ -0,0 +1,28 @@ +{ + "command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance", + "completion_time": "2023-07-17-21:46", + "metrics": { + "run_time": "87.21 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "86.967 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + } +} \ No newline at end of file diff --git a/agbenchmark/reports/mini-agi/12.2_TestDebugSimpleTypoWithGuidance.json b/agbenchmark/reports/mini-agi/12.2_TestDebugSimpleTypoWithGuidance.json new file mode 100644 index 000000000..4765201fb --- /dev/null +++ b/agbenchmark/reports/mini-agi/12.2_TestDebugSimpleTypoWithGuidance.json @@ -0,0 +1,28 @@ +{ + "command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance", + "completion_time": "2023-07-17-21:47", + "metrics": { + "run_time": "48.52 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "48.208 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + } +} \ No newline at end of file diff --git a/agbenchmark/reports/mini-agi/12.3_TestDebugSimpleTypoWithGuidance.json b/agbenchmark/reports/mini-agi/12.3_TestDebugSimpleTypoWithGuidance.json new file mode 100644 index 000000000..ac2592f33 --- /dev/null +++ b/agbenchmark/reports/mini-agi/12.3_TestDebugSimpleTypoWithGuidance.json @@ -0,0 +1,28 @@ +{ + "command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance", + "completion_time": "2023-07-17-21:55", + "metrics": { + "run_time": "54.95 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": 
"1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "54.741 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + } +} \ No newline at end of file diff --git a/agbenchmark/reports/mini-agi/12_TestDebugSimpleTypoWithGuidance.json b/agbenchmark/reports/mini-agi/12_TestDebugSimpleTypoWithGuidance.json new file mode 100644 index 000000000..e84c6e9a8 --- /dev/null +++ b/agbenchmark/reports/mini-agi/12_TestDebugSimpleTypoWithGuidance.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance", + "completion_time": "2023-07-17-21:44", + "metrics": { + "run_time": "63.37 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "63.125 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/1_TestWriteFIle.json b/agbenchmark/reports/mini-agi/1_TestWriteFIle.json index e64783190..6ac7d1045 100644 --- a/agbenchmark/reports/mini-agi/1_TestWriteFIle.json +++ b/agbenchmark/reports/mini-agi/1_TestWriteFIle.json @@ -9,6 +9,7 @@ "TestWriteFile": { "data_path": "agbenchmark/challenges/interface/write_file", "is_regression": false, + "reached_cutoff": false, "metrics": { "difficulty": "interface", "success": true, @@ -18,8 +19,7 @@ } }, "config": { - "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "agbenchmark.benchmarks" + "workspace": "${os.path.join(Path.home(), 'miniagi')}" }, "additional": { "model": "gpt-4" diff --git a/agbenchmark/reports/mini-agi/2.1_TestReadFile.json b/agbenchmark/reports/mini-agi/2.1_TestReadFile.json index b5d73af99..4758addf1 100644 --- a/agbenchmark/reports/mini-agi/2.1_TestReadFile.json +++ b/agbenchmark/reports/mini-agi/2.1_TestReadFile.json @@ -9,6 +9,7 @@ "TestReadFile": { "data_path": "agbenchmark/challenges/interface/read_file", "is_regression": true, + "reached_cutoff": true, "metrics": { "difficulty": "interface", "success": true, @@ -21,7 +22,6 @@ "workspace": "${os.path.join(Path.home(), 'miniagi')}" }, "additional": { - "model": "gpt-4", - "reached_termination_time": true + "model": "gpt-3.5-turbo" } } diff --git a/agbenchmark/reports/mini-agi/2_TestReadFile.json b/agbenchmark/reports/mini-agi/2_TestReadFile.json index 869eaaac1..87c7956d6 100644 --- a/agbenchmark/reports/mini-agi/2_TestReadFile.json +++ b/agbenchmark/reports/mini-agi/2_TestReadFile.json @@ -9,6 +9,7 @@ "TestReadFile": { "data_path": "agbenchmark/challenges/interface/read_file", "is_regression": true, + "reached_cutoff": false, "metrics": { "difficulty": "interface", "success": true, @@ 
-18,8 +19,7 @@ } }, "config": { - "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "agbenchmark.benchmarks" + "workspace": "${os.path.join(Path.home(), 'miniagi')}" }, "additional": { "model": "gpt-4" diff --git a/agbenchmark/reports/mini-agi/3.1_TestSearch.json b/agbenchmark/reports/mini-agi/3.1_TestSearch.json new file mode 100644 index 000000000..6a2744e72 --- /dev/null +++ b/agbenchmark/reports/mini-agi/3.1_TestSearch.json @@ -0,0 +1,27 @@ +{ + "command": "agbenchmark start --test TestSearch", + "completion_time": "2023-07-17-13:35", + "metrics": { + "run_time": "20.58 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": true, + "reached_cutoff": false, + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "20.367 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-3.5-turbo" + } +} diff --git a/agbenchmark/reports/mini-agi/3_TestSearch.json b/agbenchmark/reports/mini-agi/3_TestSearch.json index d9d05db4a..c7d6c4309 100644 --- a/agbenchmark/reports/mini-agi/3_TestSearch.json +++ b/agbenchmark/reports/mini-agi/3_TestSearch.json @@ -9,6 +9,7 @@ "TestSearch": { "data_path": "agbenchmark/challenges/interface/search", "is_regression": true, + "reached_cutoff": false, "metrics": { "difficulty": "interface", "success": true, @@ -18,8 +19,7 @@ } }, "config": { - "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "agbenchmark.benchmarks" + "workspace": "${os.path.join(Path.home(), 'miniagi')}" }, "additional": { "model": "gpt-4" diff --git a/agbenchmark/reports/mini-agi/4.1_TestBasicRetrieval.json b/agbenchmark/reports/mini-agi/4.1_TestBasicRetrieval.json new file mode 100644 index 000000000..6ff0fa63b --- /dev/null +++ b/agbenchmark/reports/mini-agi/4.1_TestBasicRetrieval.json @@ -0,0 +1,27 @@ +{ + "command": "agbenchmark start --test TestBasicRetrieval", + "completion_time": "2023-07-17-13:31", + "metrics": { + "run_time": "26.05 seconds", + "highest_difficulty": "basic: 2" + }, + "tests": { + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1", + "is_regression": true, + "reached_cutoff": false, + "metrics": { + "difficulty": "basic", + "success": true, + "success_%": 100.0, + "run_time": "25.818 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json b/agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json deleted file mode 100644 index d72d599d8..000000000 --- a/agbenchmark/reports/mini-agi/4.1_TestDebugSimpleTypoWithGuidance.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance", - "completion_time": "2023-07-15-22:16", - "metrics": { - "run_time": "45.92 seconds", - "highest_difficulty": ": 0" - }, - "tests": { - "TestDebugSimpleTypoWithGuidance": { - "data_path": "agbenchmark/challenges/code/d1", - "is_regression": false, - "metrics": { - "difficulty": "basic", - "success": false, - "fail_reason": "assert 1 in [0.0]", - "success_%": 0.0, - "run_time": "45.599 seconds" - } - } - }, - "config": { - "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "agbenchmark.benchmarks" - }, - "additional": { - "model": "gpt-4" - } -} diff --git 
a/agbenchmark/reports/mini-agi/4_TestBasicRetrieval.json b/agbenchmark/reports/mini-agi/4_TestBasicRetrieval.json new file mode 100644 index 000000000..54c4fdcca --- /dev/null +++ b/agbenchmark/reports/mini-agi/4_TestBasicRetrieval.json @@ -0,0 +1,27 @@ +{ + "command": "agbenchmark start --test TestBasicRetrieval", + "completion_time": "2023-07-17-13:22", + "metrics": { + "run_time": "61.24 seconds", + "highest_difficulty": "basic: 2" + }, + "tests": { + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1", + "is_regression": true, + "reached_cutoff": true, + "metrics": { + "difficulty": "basic", + "success": true, + "success_%": 100.0, + "run_time": "60.872 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-3.5-turbo" + } +} diff --git a/agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json b/agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json deleted file mode 100644 index 7985a7843..000000000 --- a/agbenchmark/reports/mini-agi/4_TestDebugSimpleTypoWithGuidance.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "command": "agbenchmark start --test TestDebugSimpleTypoWithGuidance", - "completion_time": "2023-07-15-22:15", - "metrics": { - "run_time": "32.99 seconds", - "highest_difficulty": ": 0" - }, - "tests": { - "TestDebugSimpleTypoWithGuidance": { - "data_path": "agbenchmark/challenges/code/d1", - "is_regression": false, - "metrics": { - "difficulty": "basic", - "success": false, - "fail_reason": "assert 1 in [0.0]", - "success_%": 0.0, - "run_time": "32.582 seconds" - } - } - }, - "config": { - "workspace": "${os.path.join(Path.home(), 'miniagi')}", - "entry_path": "agbenchmark.benchmarks" - }, - "additional": { - "model": "gpt-4" - } -} diff --git a/agbenchmark/reports/mini-agi/5.1_TestRetrieval2.0.json b/agbenchmark/reports/mini-agi/5.1_TestRetrieval2.0.json new file mode 100644 index 000000000..4149ebe70 --- /dev/null +++ b/agbenchmark/reports/mini-agi/5.1_TestRetrieval2.0.json @@ -0,0 +1,30 @@ +{ + "command": "agbenchmark start --test TestRetrieval2.0", + "completion_time": "2023-07-17-17:10", + "metrics": { + "run_time": "66.81 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "reached_cutoff": true, + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "66.547 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-3.5-turbo" + } +} diff --git a/agbenchmark/reports/mini-agi/5_TestRetrieval2.0.json b/agbenchmark/reports/mini-agi/5_TestRetrieval2.0.json new file mode 100644 index 000000000..28d091d28 --- /dev/null +++ b/agbenchmark/reports/mini-agi/5_TestRetrieval2.0.json @@ -0,0 +1,29 @@ +{ + "command": "agbenchmark start --test TestRetrieval2", + "completion_time": "2023-07-17-13:54", + "metrics": { + "run_time": "36 seconds", + "highest_difficulty": "TestRetrieval2: 3" + }, + "tests": { + "TestRetrieval2": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "reached_cutoff": false, + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": 
"novice", + "success": true, + "success_%": 50.0, + "run_time": "35.59 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/6.1_TestRetrieval2.1.json b/agbenchmark/reports/mini-agi/6.1_TestRetrieval2.1.json new file mode 100644 index 000000000..ed3ede1d3 --- /dev/null +++ b/agbenchmark/reports/mini-agi/6.1_TestRetrieval2.1.json @@ -0,0 +1,30 @@ +{ + "command": "agbenchmark start --test TestRetrieval2.1", + "completion_time": "2023-07-17-17:27", + "metrics": { + "run_time": "64.44 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "reached_cutoff:": true, + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "assert 1 in [0.0, 0.0]", + "success_%": 0.0, + "run_time": "64.216 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-3.5-turbo" + } +} diff --git a/agbenchmark/reports/mini-agi/6.2_TestRetrieval2.1.json b/agbenchmark/reports/mini-agi/6.2_TestRetrieval2.1.json new file mode 100644 index 000000000..04f972329 --- /dev/null +++ b/agbenchmark/reports/mini-agi/6.2_TestRetrieval2.1.json @@ -0,0 +1,30 @@ +{ + "command": "agbenchmark start --test TestRetrieval2.1", + "completion_time": "2023-07-17-17:53", + "metrics": { + "run_time": "30.08 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "29.711 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-3.5-turbo" + } +} diff --git a/agbenchmark/reports/mini-agi/6.3_TestRetrieval2.1.json b/agbenchmark/reports/mini-agi/6.3_TestRetrieval2.1.json new file mode 100644 index 000000000..383774347 --- /dev/null +++ b/agbenchmark/reports/mini-agi/6.3_TestRetrieval2.1.json @@ -0,0 +1,30 @@ +{ + "command": "agbenchmark start --test TestRetrieval2.1", + "completion_time": "2023-07-17-17:54", + "metrics": { + "run_time": "27.49 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "27.266 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-3.5-turbo" + } +} diff --git a/agbenchmark/reports/mini-agi/6.4_TestRetrieval2.1.json b/agbenchmark/reports/mini-agi/6.4_TestRetrieval2.1.json new file mode 100644 index 000000000..71cd9e007 --- /dev/null +++ 
b/agbenchmark/reports/mini-agi/6.4_TestRetrieval2.1.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestRetrieval2.1", + "completion_time": "2023-07-17-17:56", + "metrics": { + "run_time": "23.64 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "23.42 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-3.5-turbo" + } +} diff --git a/agbenchmark/reports/mini-agi/6_TestRetrieval2.1.json b/agbenchmark/reports/mini-agi/6_TestRetrieval2.1.json new file mode 100644 index 000000000..1dceec03d --- /dev/null +++ b/agbenchmark/reports/mini-agi/6_TestRetrieval2.1.json @@ -0,0 +1,30 @@ +{ + "command": "agbenchmark start --test TestRetrieval2.1", + "completion_time": "2023-07-17-14:03", + "metrics": { + "run_time": "68.39 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "reached_cutoff": true, + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "assert 1 in []", + "success_%": 0.0, + "run_time": "68.15 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/7.1_TestRetrieval2.2.json b/agbenchmark/reports/mini-agi/7.1_TestRetrieval2.2.json new file mode 100644 index 000000000..99373f7f1 --- /dev/null +++ b/agbenchmark/reports/mini-agi/7.1_TestRetrieval2.2.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestRetrieval2.2", + "completion_time": "2023-07-17-17:57", + "metrics": { + "run_time": "31.1 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. 
In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "30.888 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/7_TestRetrieval2.2.json b/agbenchmark/reports/mini-agi/7_TestRetrieval2.2.json new file mode 100644 index 000000000..ccdca26b3 --- /dev/null +++ b/agbenchmark/reports/mini-agi/7_TestRetrieval2.2.json @@ -0,0 +1,30 @@ +{ + "command": "agbenchmark start --test TestRetrieval2.2", + "completion_time": "2023-07-17-14:04", + "metrics": { + "run_time": "28.08 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "reached_cutoff": false, + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "27.857 seconds" + } + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/8.1_TestBasicMemory.json b/agbenchmark/reports/mini-agi/8.1_TestBasicMemory.json new file mode 100644 index 000000000..66cc2f9ae --- /dev/null +++ b/agbenchmark/reports/mini-agi/8.1_TestBasicMemory.json @@ -0,0 +1,30 @@ +{ + "command": "agbenchmark start --test TestBasicMemory", + "completion_time": "2023-07-17-18:22", + "metrics": { + "run_time": "53.48 seconds", + "highest_difficulty": "basic: 2" + }, + "tests": { + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": true, + "success_%": 75.0, + "run_time": "53.252 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/8_TestBasicMemory.json b/agbenchmark/reports/mini-agi/8_TestBasicMemory.json new file mode 100644 index 000000000..7ce535507 --- /dev/null +++ b/agbenchmark/reports/mini-agi/8_TestBasicMemory.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestBasicMemory", + "completion_time": "2023-07-17-18:18", + "metrics": { + "run_time": "62.11 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\results.txt'", + "success_%": 83.33, + "run_time": "61.879 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-3.5-turbo" + } +} diff --git a/agbenchmark/reports/mini-agi/9.1_TestRememberMultipleIds.json b/agbenchmark/reports/mini-agi/9.1_TestRememberMultipleIds.json new file mode 100644 index 000000000..462e73900 --- /dev/null +++ b/agbenchmark/reports/mini-agi/9.1_TestRememberMultipleIds.json @@ -0,0 +1,30 @@ +{ + "command": "agbenchmark start --test TestRememberMultipleIds", + "completion_time": "2023-07-17-20:22", + "metrics": { + "run_time": "57.6 seconds", + "highest_difficulty": "novice: 3" + }, + "tests": { + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": true, + "success_%": 33.33, + "run_time": "57.355 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-4" + } +} diff --git a/agbenchmark/reports/mini-agi/9_TestRememberMultipleIds.json b/agbenchmark/reports/mini-agi/9_TestRememberMultipleIds.json new file mode 100644 index 000000000..aa726196e --- /dev/null +++ b/agbenchmark/reports/mini-agi/9_TestRememberMultipleIds.json @@ -0,0 +1,31 @@ +{ + "command": "agbenchmark start --test TestRememberMultipleIds", + "completion_time": "2023-07-17-18:33", + "metrics": { + "run_time": "61.32 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: 'C:\\\\Users\\\\silen\\\\miniagi\\\\result.txt'", + "success_%": 0.0, + "run_time": "61.089 seconds" + }, + "reached_cutoff": true + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + }, + "additional": { + "model": "gpt-3.5-turbo" + } +} diff --git a/agbenchmark/utils.py b/agbenchmark/utils.py index f1ed43639..63f4d836c 100644 --- a/agbenchmark/utils.py +++ b/agbenchmark/utils.py @@ -55,7 +55,6 @@ def calculate_info_test_path(reports_path: Path) -> str: all_prefix_numbers.append(math.floor(number)) max_prefix = max(all_prefix_numbers, default=0) - print("HEY WE ARE HERE BIG DAWG", max_prefix) run_name = f"{max_prefix + 1}_{test_arg}.json" else: # Take the number from before the _ and add the .{number} @@ -118,7 +117,9 @@ def get_highest_success_difficulty(data: dict) -> str: else: highest_difficulty_str = "" - return f"{highest_difficulty_str}: {highest_difficulty_level}" + if highest_difficulty_level: + return f"{highest_difficulty_str}: {highest_difficulty_level}" + return "No successful tests" def assign_paths(folder_path: Path) -> tuple[str, str, str]: diff --git a/agent/mini-agi b/agent/mini-agi index 0a9fcd8c3..4a346ab7c 160000 --- a/agent/mini-agi +++ b/agent/mini-agi @@ -1 +1 @@ -Subproject commit 0a9fcd8c3d6352ef42d436cff7b64683a7a7ca2d +Subproject commit 4a346ab7cb8dbcfd3bf2cee49448d26e01406ba3 -- cgit v1.2.3 From 328643e5f2bb6f447b02b6fe5779eea6dcd0db59 Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Tue, 18 Jul 2023 02:44:35 +0000 Subject: smol-developer-20230718024435 --- reports/smol-developer/file2_07-18-02-43.json | 266 ++++++++++++++++++++++++++ 1 file changed, 266 insertions(+) create mode 100644 reports/smol-developer/file2_07-18-02-43.json diff --git a/reports/smol-developer/file2_07-18-02-43.json b/reports/smol-developer/file2_07-18-02-43.json new file mode 100644 index 000000000..c07c322be --- /dev/null +++ b/reports/smol-developer/file2_07-18-02-43.json @@ -0,0 +1,266 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-18-02:44", + "metrics": { + "run_time": "38.24 seconds", + "highest_difficulty": "advanced: 5" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "6.509 seconds" + }, + "reached_cutoff": false + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": true, + "success_%": 100.0, + "run_time": "9.258 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "13.455 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "7.264 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "1.572 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": 
"agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. 
", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "generated" + } +} \ No newline at end of file -- cgit v1.2.3 From 0eb04c1e342e7b937f86797884dffff5972c8adb Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Tue, 18 Jul 2023 02:45:45 +0000 Subject: Auto-GPT-20230718024545 --- reports/Auto-GPT/file2_07-18-02-45.json | 268 ++++++++++++++++++++++++++++++++ 1 file changed, 268 insertions(+) create mode 100644 reports/Auto-GPT/file2_07-18-02-45.json diff --git a/reports/Auto-GPT/file2_07-18-02-45.json b/reports/Auto-GPT/file2_07-18-02-45.json new file mode 100644 index 000000000..f0cc9b962 --- /dev/null +++ b/reports/Auto-GPT/file2_07-18-02-45.json @@ -0,0 +1,268 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-18-02:45", + "metrics": { + "run_time": "26.11 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "25.778 seconds" + }, + "reached_cutoff": false + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.006 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestThreeSum::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0], venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0], venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0], venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0], venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "autogpt/workspace/auto_gpt_workspace" + } +} \ No newline at end of file -- cgit v1.2.3 From 857b72df87a0a9e7977199369ece51c540d1e6e4 Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Tue, 18 Jul 2023 02:46:36 +0000 Subject: gpt-engineer-20230718024636 --- reports/gpt-engineer/file2_07-18-02-44.json | 267 ++++++++++++++++++++++++++++ 1 file changed, 267 insertions(+) create mode 100644 reports/gpt-engineer/file2_07-18-02-44.json diff --git a/reports/gpt-engineer/file2_07-18-02-44.json b/reports/gpt-engineer/file2_07-18-02-44.json new file mode 100644 index 000000000..1c24f4f8c --- /dev/null +++ b/reports/gpt-engineer/file2_07-18-02-44.json @@ -0,0 +1,267 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-18-02:46", + "metrics": { + "run_time": "123.02 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "72.83 seconds" + }, + "reached_cutoff": true + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. 
You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "[Errno 20] Not a directory: '/home/runner/work/Auto-GPT-Benchmarks/Auto-GPT-Benchmarks/agent/gpt-engineer/projects/my-new-project/workspace'", + "success_%": 0.0, + "run_time": "47.884 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "[Errno 20] Not a directory: '/home/runner/work/Auto-GPT-Benchmarks/Auto-GPT-Benchmarks/agent/gpt-engineer/projects/my-new-project/workspace/file_to_check.txt'", + "success_%": 0.0, + "run_time": "0.955 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in []", + "success_%": 0.0, + "run_time": "0.944 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestThreeSum::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. 
", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.003 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. 
In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "projects/my-new-project/workspace" + } +} \ No newline at end of file -- cgit v1.2.3 From 86978b59246de8cce22928f653974fca51fc358c Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Tue, 18 Jul 2023 02:51:30 +0000 Subject: mini-agi-20230718025130 --- reports/mini-agi/file1_07-18-02-44.json | 260 ++++++++++++++++++++++++++++++++ 1 file changed, 260 insertions(+) create mode 100644 reports/mini-agi/file1_07-18-02-44.json diff --git a/reports/mini-agi/file1_07-18-02-44.json b/reports/mini-agi/file1_07-18-02-44.json new file mode 100644 index 000000000..a8e9f0fe4 --- /dev/null +++ b/reports/mini-agi/file1_07-18-02-44.json @@ -0,0 +1,260 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-18-02:51", + "metrics": { + "run_time": "407.24 seconds", + "highest_difficulty": "advanced: 5" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": true, + "task": "Print the the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "14.551 seconds" + }, + "reached_cutoff": false + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "37.551 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": true, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "19.674 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": true, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "21.582 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestThreeSum::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": true, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": true, + "success_%": 100.0, + "run_time": "23.659 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": true, + "success_%": 20.0, + "run_time": "45.503 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + 
"is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: '/home/runner/miniagi/result.txt'", + "success_%": 66.67, + "run_time": "69.968 seconds" + }, + "reached_cutoff": true + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": true, + "success_%": 66.67, + "run_time": "30.055 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": true, + "success_%": 100.0, + "run_time": "57.289 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": true, + "success_%": 100.0, + "run_time": "63.121 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 25.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "24.052 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 33.33, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. 
In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "${os.path.join(Path.home(), 'miniagi')}" + } +} \ No newline at end of file -- cgit v1.2.3 From e5856588653303244ce769a7d5d70320f3048806 Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Tue, 18 Jul 2023 08:21:11 +0000 Subject: gpt-engineer-20230718082111 --- reports/gpt-engineer/file3_07-18-08-19.json | 267 ++++++++++++++++++++++++++++ 1 file changed, 267 insertions(+) create mode 100644 reports/gpt-engineer/file3_07-18-08-19.json diff --git a/reports/gpt-engineer/file3_07-18-08-19.json b/reports/gpt-engineer/file3_07-18-08-19.json new file mode 100644 index 000000000..10e4cf79e --- /dev/null +++ b/reports/gpt-engineer/file3_07-18-08-19.json @@ -0,0 +1,267 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-18-08:21", + "metrics": { + "run_time": "123.71 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "65.886 seconds" + }, + "reached_cutoff": true + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "[Errno 20] Not a directory: '/home/runner/work/Auto-GPT-Benchmarks/Auto-GPT-Benchmarks/agent/gpt-engineer/projects/my-new-project/workspace'", + "success_%": 0.0, + "run_time": "55.938 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in []", + "success_%": 0.0, + "run_time": "0.788 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "[Errno 20] Not a directory: '/home/runner/work/Auto-GPT-Benchmarks/Auto-GPT-Benchmarks/agent/gpt-engineer/projects/my-new-project/workspace/file_to_check.txt'", + "success_%": 0.0, + "run_time": "0.787 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestThreeSum::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "projects/my-new-project/workspace" + } +} \ No newline at end of file -- cgit v1.2.3 From b2f52f08f4f66beb2ce7de15801a50feb1836c7b Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Tue, 18 Jul 2023 08:22:55 +0000 Subject: Auto-GPT-20230718082255 --- reports/Auto-GPT/file3_07-18-08-19.json | 267 ++++++++++++++++++++++++++++++++ 1 file changed, 267 insertions(+) create mode 100644 reports/Auto-GPT/file3_07-18-08-19.json diff --git a/reports/Auto-GPT/file3_07-18-08-19.json b/reports/Auto-GPT/file3_07-18-08-19.json new file mode 100644 index 000000000..653f7a8da --- /dev/null +++ b/reports/Auto-GPT/file3_07-18-08-19.json @@ -0,0 +1,267 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-18-08:22", + "metrics": { + "run_time": "202.62 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "36.149 seconds" + }, + "reached_cutoff": false + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "81.97 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "23.569 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "60.708 seconds" + }, + "reached_cutoff": true + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestThreeSum::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "autogpt/workspace/auto_gpt_workspace" + } +} \ No newline at end of file -- cgit v1.2.3 From 15d50a5ccb8b59bbee5e88c579726f4487a0eca3 Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Tue, 18 Jul 2023 08:24:43 +0000 Subject: smol-developer-20230718082443 --- reports/smol-developer/file3_07-18-08-19.json | 266 ++++++++++++++++++++++++++ 1 file changed, 266 insertions(+) create mode 100644 reports/smol-developer/file3_07-18-08-19.json diff --git a/reports/smol-developer/file3_07-18-08-19.json b/reports/smol-developer/file3_07-18-08-19.json new file mode 100644 index 000000000..7124e24c3 --- /dev/null +++ b/reports/smol-developer/file3_07-18-08-19.json @@ -0,0 +1,266 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-18-08:24", + "metrics": { + "run_time": "315.94 seconds", + "highest_difficulty": "advanced: 5" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "5.447 seconds" + }, + "reached_cutoff": false + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. 
You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": true, + "success_%": 100.0, + "run_time": "286.755 seconds" + }, + "reached_cutoff": true + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "12.291 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "9.912 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "1.322 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 million).",
+      "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions",
+      "description": "Tests the ability to retrieve information.",
+      "metrics": {
+        "difficulty": "intermediate",
+        "success": false,
+        "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.001 seconds"
+      },
+      "reached_cutoff": false
+    },
+    "TestRetrieval2.2": {
+      "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting",
+      "is_regression": false,
+      "task": "Write Tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 million).",
+      "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.",
+      "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.",
+      "metrics": {
+        "difficulty": "intermediate",
+        "success": false,
+        "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.001 seconds"
+      },
+      "reached_cutoff": false
+    },
+    "TestRememberMultiplePhrasesWithNoise": {
+      "data_path": "agbenchmark/challenges/memory/m4_phrases",
+      "is_regression": false,
+      "task": "Follow the instructions in the instructions_1.txt file",
+      "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
+      "description": "Tests the agent's ability to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file.
Some noise is also added to the files to test the agent's ability to filter out noise.",
+      "metrics": {
+        "difficulty": "advanced",
+        "success": false,
+        "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.001 seconds"
+      },
+      "reached_cutoff": false
+    }
+  },
+  "config": {
+    "workspace": "generated"
+  }
+}
\ No newline at end of file
-- cgit v1.2.3


From d46124a9d82eb3d1fe4aa53acd165dc4a9817820 Mon Sep 17 00:00:00 2001
From: merwanehamadi
Date: Tue, 18 Jul 2023 09:17:45 -0700
Subject: Push reports to google drive (#167)

Signed-off-by: Merwane Hamadi
---
 .github/workflows/ci.yml |   3 +
 .gitignore               |   1 +
 json_to_base_64.py       |  17 ++
 poetry.lock              | 753 ++++++++++++++++++++++++++++++++++-------------
 pyproject.toml           |   3 +
 send_to_googledrive.py   | 112 +++++++
 6 files changed, 682 insertions(+), 207 deletions(-)
 create mode 100644 json_to_base_64.py
 create mode 100644 send_to_googledrive.py

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 3b0dc50fe..e34b2e864 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -191,3 +191,6 @@ jobs:
           git fetch origin $current_branch
           git rebase origin/$current_branch
           git push origin HEAD
+          poetry run python send_to_googledrive.py
+        env:
+          GDRIVE_BASE64: ${{ secrets.GDRIVE_BASE64 }}
diff --git a/.gitignore b/.gitignore
index 7d0419ca4..1b0f3ba14 100644
--- a/.gitignore
+++ b/.gitignore
@@ -162,3 +162,4 @@ cython_debug/
 .idea/
 .DS_Store
 ```
+secrets.json
diff --git a/json_to_base_64.py b/json_to_base_64.py
new file mode 100644
index 000000000..42cbc4df8
--- /dev/null
+++ b/json_to_base_64.py
@@ -0,0 +1,17 @@
+import base64
+import json
+
+# Load JSON data from a file
+with open("secrets.json", "r") as f:
+    data = json.load(f)
+
+# Convert the JSON object into a string
+json_string = json.dumps(data)
+
+# Encode the string into bytes
+json_bytes = json_string.encode("utf-8")
+
+# Convert the bytes to a base64 string
+base64_string = base64.b64encode(json_bytes).decode("utf-8")
+
+print(base64_string)
diff --git a/poetry.lock b/poetry.lock
index ad72f5e10..5b51cb14e 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -217,6 +217,18 @@ d = ["aiohttp (>=3.7.4)"]
 jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"]
 uvloop = ["uvloop (>=0.15.2)"]
 
+[[package]]
+name = "cachetools"
+version = "5.3.1"
+description = "Extensible memoizing collections and decorators"
+category = "dev"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "cachetools-5.3.1-py3-none-any.whl", hash = "sha256:95ef631eeaea14ba2e36f06437f36463aac3a096799e876ee55e5cdccb102590"},
+    {file = "cachetools-5.3.1.tar.gz", hash = "sha256:dce83f2d9b4e1f732a8cd44af8e8fab2dbe46201467fc98b3ef8f269092bf62b"},
+]
+
 [[package]]
 name = "certifi"
 version = "2023.5.7"
 description = "Python package for providing Mozilla's CA Bundle."
 category = "main"
 optional = false
 python-versions = ">=3.6"
 files = [
     {file = "certifi-2023.5.7-py3-none-any.whl", hash = "sha256:c6c2e98f5c7869929f25f425ddc5bcffadcb1f8f3abd6d285b0447447c2300b5"},
     {file = "certifi-2023.5.7.tar.gz", hash = "sha256:0f0d56dc5a6ad56fd4ba36484d6cc34451e1c6548c61daad8c320169f91eddc7"},
 ]
 
 [[package]]
 name = "charset-normalizer"
-version = "3.1.0"
+version = "3.2.0"
 description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
category = "main" optional = false python-versions = ">=3.7.0" files = [ - {file = "charset-normalizer-3.1.0.tar.gz", hash = "sha256:34e0a2f9c370eb95597aae63bf85eb5e96826d81e3dcf88b8886012906f509b5"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e0ac8959c929593fee38da1c2b64ee9778733cdf03c482c9ff1d508b6b593b2b"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d7fc3fca01da18fbabe4625d64bb612b533533ed10045a2ac3dd194bfa656b60"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:04eefcee095f58eaabe6dc3cc2262f3bcd776d2c67005880894f447b3f2cb9c1"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20064ead0717cf9a73a6d1e779b23d149b53daf971169289ed2ed43a71e8d3b0"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1435ae15108b1cb6fffbcea2af3d468683b7afed0169ad718451f8db5d1aff6f"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c84132a54c750fda57729d1e2599bb598f5fa0344085dbde5003ba429a4798c0"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75f2568b4189dda1c567339b48cba4ac7384accb9c2a7ed655cd86b04055c795"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:11d3bcb7be35e7b1bba2c23beedac81ee893ac9871d0ba79effc7fc01167db6c"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:891cf9b48776b5c61c700b55a598621fdb7b1e301a550365571e9624f270c203"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:5f008525e02908b20e04707a4f704cd286d94718f48bb33edddc7d7b584dddc1"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:b06f0d3bf045158d2fb8837c5785fe9ff9b8c93358be64461a1089f5da983137"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:49919f8400b5e49e961f320c735388ee686a62327e773fa5b3ce6721f7e785ce"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:22908891a380d50738e1f978667536f6c6b526a2064156203d418f4856d6e86a"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-win32.whl", hash = "sha256:12d1a39aa6b8c6f6248bb54550efcc1c38ce0d8096a146638fd4738e42284448"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:65ed923f84a6844de5fd29726b888e58c62820e0769b76565480e1fdc3d062f8"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9a3267620866c9d17b959a84dd0bd2d45719b817245e49371ead79ed4f710d19"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6734e606355834f13445b6adc38b53c0fd45f1a56a9ba06c2058f86893ae8017"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f8303414c7b03f794347ad062c0516cee0e15f7a612abd0ce1e25caf6ceb47df"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aaf53a6cebad0eae578f062c7d462155eada9c172bd8c4d250b8c1d8eb7f916a"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3dc5b6a8ecfdc5748a7e429782598e4f17ef378e3e272eeb1340ea57c9109f41"}, - {file = 
"charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e1b25e3ad6c909f398df8921780d6a3d120d8c09466720226fc621605b6f92b1"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ca564606d2caafb0abe6d1b5311c2649e8071eb241b2d64e75a0d0065107e62"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b82fab78e0b1329e183a65260581de4375f619167478dddab510c6c6fb04d9b6"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:bd7163182133c0c7701b25e604cf1611c0d87712e56e88e7ee5d72deab3e76b5"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:11d117e6c63e8f495412d37e7dc2e2fff09c34b2d09dbe2bee3c6229577818be"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:cf6511efa4801b9b38dc5546d7547d5b5c6ef4b081c60b23e4d941d0eba9cbeb"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:abc1185d79f47c0a7aaf7e2412a0eb2c03b724581139193d2d82b3ad8cbb00ac"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cb7b2ab0188829593b9de646545175547a70d9a6e2b63bf2cd87a0a391599324"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-win32.whl", hash = "sha256:c36bcbc0d5174a80d6cccf43a0ecaca44e81d25be4b7f90f0ed7bcfbb5a00909"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:cca4def576f47a09a943666b8f829606bcb17e2bc2d5911a46c8f8da45f56755"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:0c95f12b74681e9ae127728f7e5409cbbef9cd914d5896ef238cc779b8152373"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fca62a8301b605b954ad2e9c3666f9d97f63872aa4efcae5492baca2056b74ab"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ac0aa6cd53ab9a31d397f8303f92c42f534693528fafbdb997c82bae6e477ad9"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c3af8e0f07399d3176b179f2e2634c3ce9c1301379a6b8c9c9aeecd481da494f"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a5fc78f9e3f501a1614a98f7c54d3969f3ad9bba8ba3d9b438c3bc5d047dd28"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:628c985afb2c7d27a4800bfb609e03985aaecb42f955049957814e0491d4006d"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:74db0052d985cf37fa111828d0dd230776ac99c740e1a758ad99094be4f1803d"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:1e8fcdd8f672a1c4fc8d0bd3a2b576b152d2a349782d1eb0f6b8e52e9954731d"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:04afa6387e2b282cf78ff3dbce20f0cc071c12dc8f685bd40960cc68644cfea6"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:dd5653e67b149503c68c4018bf07e42eeed6b4e956b24c00ccdf93ac79cdff84"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d2686f91611f9e17f4548dbf050e75b079bbc2a82be565832bc8ea9047b61c8c"}, - {file = 
"charset_normalizer-3.1.0-cp37-cp37m-win32.whl", hash = "sha256:4155b51ae05ed47199dc5b2a4e62abccb274cee6b01da5b895099b61b1982974"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:322102cdf1ab682ecc7d9b1c5eed4ec59657a65e1c146a0da342b78f4112db23"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:e633940f28c1e913615fd624fcdd72fdba807bf53ea6925d6a588e84e1151531"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:3a06f32c9634a8705f4ca9946d667609f52cf130d5548881401f1eb2c39b1e2c"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7381c66e0561c5757ffe616af869b916c8b4e42b367ab29fedc98481d1e74e14"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3573d376454d956553c356df45bb824262c397c6e26ce43e8203c4c540ee0acb"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e89df2958e5159b811af9ff0f92614dabf4ff617c03a4c1c6ff53bf1c399e0e1"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:78cacd03e79d009d95635e7d6ff12c21eb89b894c354bd2b2ed0b4763373693b"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de5695a6f1d8340b12a5d6d4484290ee74d61e467c39ff03b39e30df62cf83a0"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1c60b9c202d00052183c9be85e5eaf18a4ada0a47d188a83c8f5c5b23252f649"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:f645caaf0008bacf349875a974220f1f1da349c5dbe7c4ec93048cdc785a3326"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ea9f9c6034ea2d93d9147818f17c2a0860d41b71c38b9ce4d55f21b6f9165a11"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:80d1543d58bd3d6c271b66abf454d437a438dff01c3e62fdbcd68f2a11310d4b"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:73dc03a6a7e30b7edc5b01b601e53e7fc924b04e1835e8e407c12c037e81adbd"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6f5c2e7bc8a4bf7c426599765b1bd33217ec84023033672c1e9a8b35eaeaaaf8"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-win32.whl", hash = "sha256:12a2b561af122e3d94cdb97fe6fb2bb2b82cef0cdca131646fdb940a1eda04f0"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:3160a0fd9754aab7d47f95a6b63ab355388d890163eb03b2d2b87ab0a30cfa59"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:38e812a197bf8e71a59fe55b757a84c1f946d0ac114acafaafaf21667a7e169e"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6baf0baf0d5d265fa7944feb9f7451cc316bfe30e8df1a61b1bb08577c554f31"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8f25e17ab3039b05f762b0a55ae0b3632b2e073d9c8fc88e89aca31a6198e88f"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3747443b6a904001473370d7810aa19c3a180ccd52a7157aacc264a5ac79265e"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:b116502087ce8a6b7a5f1814568ccbd0e9f6cfd99948aa59b0e241dc57cf739f"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d16fd5252f883eb074ca55cb622bc0bee49b979ae4e8639fff6ca3ff44f9f854"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21fa558996782fc226b529fdd2ed7866c2c6ec91cee82735c98a197fae39f706"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6f6c7a8a57e9405cad7485f4c9d3172ae486cfef1344b5ddd8e5239582d7355e"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:ac3775e3311661d4adace3697a52ac0bab17edd166087d493b52d4f4f553f9f0"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:10c93628d7497c81686e8e5e557aafa78f230cd9e77dd0c40032ef90c18f2230"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:6f4f4668e1831850ebcc2fd0b1cd11721947b6dc7c00bf1c6bd3c929ae14f2c7"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:0be65ccf618c1e7ac9b849c315cc2e8a8751d9cfdaa43027d4f6624bd587ab7e"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:53d0a3fa5f8af98a1e261de6a3943ca631c526635eb5817a87a59d9a57ebf48f"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-win32.whl", hash = "sha256:a04f86f41a8916fe45ac5024ec477f41f886b3c435da2d4e3d2709b22ab02af1"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:830d2948a5ec37c386d3170c483063798d7879037492540f10a475e3fd6f244b"}, - {file = "charset_normalizer-3.1.0-py3-none-any.whl", hash = "sha256:3d9098b479e78c85080c98e1e35ff40b4a31d8953102bb0fd7d1b6f8a2111a3d"}, + {file = "charset-normalizer-3.2.0.tar.gz", hash = "sha256:3bb3d25a8e6c0aedd251753a79ae98a093c7e7b471faa3aa9a93a81431987ace"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0b87549028f680ca955556e3bd57013ab47474c3124dc069faa0b6545b6c9710"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7c70087bfee18a42b4040bb9ec1ca15a08242cf5867c58726530bdf3945672ed"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a103b3a7069b62f5d4890ae1b8f0597618f628b286b03d4bc9195230b154bfa9"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94aea8eff76ee6d1cdacb07dd2123a68283cb5569e0250feab1240058f53b623"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:db901e2ac34c931d73054d9797383d0f8009991e723dab15109740a63e7f902a"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b0dac0ff919ba34d4df1b6131f59ce95b08b9065233446be7e459f95554c0dc8"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:193cbc708ea3aca45e7221ae58f0fd63f933753a9bfb498a3b474878f12caaad"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:09393e1b2a9461950b1c9a45d5fd251dc7c6f228acab64da1c9c0165d9c7765c"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:baacc6aee0b2ef6f3d308e197b5d7a81c0e70b06beae1f1fcacffdbd124fe0e3"}, + {file = 
"charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:bf420121d4c8dce6b889f0e8e4ec0ca34b7f40186203f06a946fa0276ba54029"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:c04a46716adde8d927adb9457bbe39cf473e1e2c2f5d0a16ceb837e5d841ad4f"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:aaf63899c94de41fe3cf934601b0f7ccb6b428c6e4eeb80da72c58eab077b19a"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d62e51710986674142526ab9f78663ca2b0726066ae26b78b22e0f5e571238dd"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-win32.whl", hash = "sha256:04e57ab9fbf9607b77f7d057974694b4f6b142da9ed4a199859d9d4d5c63fe96"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:48021783bdf96e3d6de03a6e39a1171ed5bd7e8bb93fc84cc649d11490f87cea"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:4957669ef390f0e6719db3613ab3a7631e68424604a7b448f079bee145da6e09"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:46fb8c61d794b78ec7134a715a3e564aafc8f6b5e338417cb19fe9f57a5a9bf2"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f779d3ad205f108d14e99bb3859aa7dd8e9c68874617c72354d7ecaec2a054ac"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f25c229a6ba38a35ae6e25ca1264621cc25d4d38dca2942a7fce0b67a4efe918"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2efb1bd13885392adfda4614c33d3b68dee4921fd0ac1d3988f8cbb7d589e72a"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1f30b48dd7fa1474554b0b0f3fdfdd4c13b5c737a3c6284d3cdc424ec0ffff3a"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:246de67b99b6851627d945db38147d1b209a899311b1305dd84916f2b88526c6"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9bd9b3b31adcb054116447ea22caa61a285d92e94d710aa5ec97992ff5eb7cf3"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:8c2f5e83493748286002f9369f3e6607c565a6a90425a3a1fef5ae32a36d749d"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:3170c9399da12c9dc66366e9d14da8bf7147e1e9d9ea566067bbce7bb74bd9c2"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:7a4826ad2bd6b07ca615c74ab91f32f6c96d08f6fcc3902ceeedaec8cdc3bcd6"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:3b1613dd5aee995ec6d4c69f00378bbd07614702a315a2cf6c1d21461fe17c23"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9e608aafdb55eb9f255034709e20d5a83b6d60c054df0802fa9c9883d0a937aa"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-win32.whl", hash = "sha256:f2a1d0fd4242bd8643ce6f98927cf9c04540af6efa92323e9d3124f57727bfc1"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:681eb3d7e02e3c3655d1b16059fbfb605ac464c834a0c629048a30fad2b27489"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c57921cda3a80d0f2b8aec7e25c8aa14479ea92b5b51b6876d975d925a2ea346"}, + 
{file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41b25eaa7d15909cf3ac4c96088c1f266a9a93ec44f87f1d13d4a0e86c81b982"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f058f6963fd82eb143c692cecdc89e075fa0828db2e5b291070485390b2f1c9c"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a7647ebdfb9682b7bb97e2a5e7cb6ae735b1c25008a70b906aecca294ee96cf4"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eef9df1eefada2c09a5e7a40991b9fc6ac6ef20b1372abd48d2794a316dc0449"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e03b8895a6990c9ab2cdcd0f2fe44088ca1c65ae592b8f795c3294af00a461c3"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:ee4006268ed33370957f55bf2e6f4d263eaf4dc3cfc473d1d90baff6ed36ce4a"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:c4983bf937209c57240cff65906b18bb35e64ae872da6a0db937d7b4af845dd7"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:3bb7fda7260735efe66d5107fb7e6af6a7c04c7fce9b2514e04b7a74b06bf5dd"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:72814c01533f51d68702802d74f77ea026b5ec52793c791e2da806a3844a46c3"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:70c610f6cbe4b9fce272c407dd9d07e33e6bf7b4aa1b7ffb6f6ded8e634e3592"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-win32.whl", hash = "sha256:a401b4598e5d3f4a9a811f3daf42ee2291790c7f9d74b18d75d6e21dda98a1a1"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-win_amd64.whl", hash = "sha256:c0b21078a4b56965e2b12f247467b234734491897e99c1d51cee628da9786959"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:95eb302ff792e12aba9a8b8f8474ab229a83c103d74a750ec0bd1c1eea32e669"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1a100c6d595a7f316f1b6f01d20815d916e75ff98c27a01ae817439ea7726329"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6339d047dab2780cc6220f46306628e04d9750f02f983ddb37439ca47ced7149"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4b749b9cc6ee664a3300bb3a273c1ca8068c46be705b6c31cf5d276f8628a94"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a38856a971c602f98472050165cea2cdc97709240373041b69030be15047691f"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f87f746ee241d30d6ed93969de31e5ffd09a2961a051e60ae6bddde9ec3583aa"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89f1b185a01fe560bc8ae5f619e924407efca2191b56ce749ec84982fc59a32a"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e1c8a2f4c69e08e89632defbfabec2feb8a8d99edc9f89ce33c4b9e36ab63037"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:2f4ac36d8e2b4cc1aa71df3dd84ff8efbe3bfb97ac41242fbcfc053c67434f46"}, + {file = 
"charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a386ebe437176aab38c041de1260cd3ea459c6ce5263594399880bbc398225b2"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:ccd16eb18a849fd8dcb23e23380e2f0a354e8daa0c984b8a732d9cfaba3a776d"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:e6a5bf2cba5ae1bb80b154ed68a3cfa2fa00fde979a7f50d6598d3e17d9ac20c"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:45de3f87179c1823e6d9e32156fb14c1927fcc9aba21433f088fdfb555b77c10"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-win32.whl", hash = "sha256:1000fba1057b92a65daec275aec30586c3de2401ccdcd41f8a5c1e2c87078706"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:8b2c760cfc7042b27ebdb4a43a4453bd829a5742503599144d54a032c5dc7e9e"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:855eafa5d5a2034b4621c74925d89c5efef61418570e5ef9b37717d9c796419c"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:203f0c8871d5a7987be20c72442488a0b8cfd0f43b7973771640fc593f56321f"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e857a2232ba53ae940d3456f7533ce6ca98b81917d47adc3c7fd55dad8fab858"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5e86d77b090dbddbe78867a0275cb4df08ea195e660f1f7f13435a4649e954e5"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4fb39a81950ec280984b3a44f5bd12819953dc5fa3a7e6fa7a80db5ee853952"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2dee8e57f052ef5353cf608e0b4c871aee320dd1b87d351c28764fc0ca55f9f4"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8700f06d0ce6f128de3ccdbc1acaea1ee264d2caa9ca05daaf492fde7c2a7200"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1920d4ff15ce893210c1f0c0e9d19bfbecb7983c76b33f046c13a8ffbd570252"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:c1c76a1743432b4b60ab3358c937a3fe1341c828ae6194108a94c69028247f22"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f7560358a6811e52e9c4d142d497f1a6e10103d3a6881f18d04dbce3729c0e2c"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:c8063cf17b19661471ecbdb3df1c84f24ad2e389e326ccaf89e3fb2484d8dd7e"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:cd6dbe0238f7743d0efe563ab46294f54f9bc8f4b9bcf57c3c666cc5bc9d1299"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:1249cbbf3d3b04902ff081ffbb33ce3377fa6e4c7356f759f3cd076cc138d020"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-win32.whl", hash = "sha256:6c409c0deba34f147f77efaa67b8e4bb83d2f11c8806405f76397ae5b8c0d1c9"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:7095f6fbfaa55defb6b733cfeb14efaae7a29f0b59d8cf213be4e7ca0b857b80"}, + {file = "charset_normalizer-3.2.0-py3-none-any.whl", hash = "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6"}, ] [[package]] name = "click" -version = "8.1.3" +version = 
"8.1.5" description = "Composable command line interface toolkit" category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"}, - {file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"}, + {file = "click-8.1.5-py3-none-any.whl", hash = "sha256:e576aa487d679441d7d30abb87e1b43d24fc53bffb8758443b1a9e1cee504548"}, + {file = "click-8.1.5.tar.gz", hash = "sha256:4be4b1af8d665c6d942909916d31a213a106800c47d0eeba73d34da3cbc11367"}, ] [package.dependencies] @@ -343,14 +355,14 @@ files = [ [[package]] name = "exceptiongroup" -version = "1.1.1" +version = "1.1.2" description = "Backport of PEP 654 (exception groups)" category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "exceptiongroup-1.1.1-py3-none-any.whl", hash = "sha256:232c37c63e4f682982c8b6459f33a8981039e5fb8756b2074364e5055c498c9e"}, - {file = "exceptiongroup-1.1.1.tar.gz", hash = "sha256:d484c3090ba2889ae2928419117447a14daf3c1231d5e30d0aae34f354f01785"}, + {file = "exceptiongroup-1.1.2-py3-none-any.whl", hash = "sha256:e346e69d186172ca7cf029c8c1d16235aa0e04035e5750b4b95039e65204328f"}, + {file = "exceptiongroup-1.1.2.tar.gz", hash = "sha256:12c3e887d6485d16943a309616de20ae5582633e0a2eda17f4e10fd61c1e8af5"}, ] [package.extras] @@ -375,86 +387,73 @@ pyflakes = ">=2.3.0,<2.4.0" [[package]] name = "frozenlist" -version = "1.3.3" +version = "1.4.0" description = "A list-like structure which implements collections.abc.MutableSequence" category = "main" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "frozenlist-1.3.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ff8bf625fe85e119553b5383ba0fb6aa3d0ec2ae980295aaefa552374926b3f4"}, - {file = "frozenlist-1.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:dfbac4c2dfcc082fcf8d942d1e49b6aa0766c19d3358bd86e2000bf0fa4a9cf0"}, - {file = "frozenlist-1.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b1c63e8d377d039ac769cd0926558bb7068a1f7abb0f003e3717ee003ad85530"}, - {file = "frozenlist-1.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7fdfc24dcfce5b48109867c13b4cb15e4660e7bd7661741a391f821f23dfdca7"}, - {file = "frozenlist-1.3.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2c926450857408e42f0bbc295e84395722ce74bae69a3b2aa2a65fe22cb14b99"}, - {file = "frozenlist-1.3.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1841e200fdafc3d51f974d9d377c079a0694a8f06de2e67b48150328d66d5483"}, - {file = "frozenlist-1.3.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f470c92737afa7d4c3aacc001e335062d582053d4dbe73cda126f2d7031068dd"}, - {file = "frozenlist-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:783263a4eaad7c49983fe4b2e7b53fa9770c136c270d2d4bbb6d2192bf4d9caf"}, - {file = "frozenlist-1.3.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:924620eef691990dfb56dc4709f280f40baee568c794b5c1885800c3ecc69816"}, - {file = "frozenlist-1.3.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ae4dc05c465a08a866b7a1baf360747078b362e6a6dbeb0c57f234db0ef88ae0"}, - {file = "frozenlist-1.3.3-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:bed331fe18f58d844d39ceb398b77d6ac0b010d571cba8267c2e7165806b00ce"}, - {file = 
"frozenlist-1.3.3-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:02c9ac843e3390826a265e331105efeab489ffaf4dd86384595ee8ce6d35ae7f"}, - {file = "frozenlist-1.3.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9545a33965d0d377b0bc823dcabf26980e77f1b6a7caa368a365a9497fb09420"}, - {file = "frozenlist-1.3.3-cp310-cp310-win32.whl", hash = "sha256:d5cd3ab21acbdb414bb6c31958d7b06b85eeb40f66463c264a9b343a4e238642"}, - {file = "frozenlist-1.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:b756072364347cb6aa5b60f9bc18e94b2f79632de3b0190253ad770c5df17db1"}, - {file = "frozenlist-1.3.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:b4395e2f8d83fbe0c627b2b696acce67868793d7d9750e90e39592b3626691b7"}, - {file = "frozenlist-1.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:14143ae966a6229350021384870458e4777d1eae4c28d1a7aa47f24d030e6678"}, - {file = "frozenlist-1.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5d8860749e813a6f65bad8285a0520607c9500caa23fea6ee407e63debcdbef6"}, - {file = "frozenlist-1.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23d16d9f477bb55b6154654e0e74557040575d9d19fe78a161bd33d7d76808e8"}, - {file = "frozenlist-1.3.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:eb82dbba47a8318e75f679690190c10a5e1f447fbf9df41cbc4c3afd726d88cb"}, - {file = "frozenlist-1.3.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9309869032abb23d196cb4e4db574232abe8b8be1339026f489eeb34a4acfd91"}, - {file = "frozenlist-1.3.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a97b4fe50b5890d36300820abd305694cb865ddb7885049587a5678215782a6b"}, - {file = "frozenlist-1.3.3-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c188512b43542b1e91cadc3c6c915a82a5eb95929134faf7fd109f14f9892ce4"}, - {file = "frozenlist-1.3.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:303e04d422e9b911a09ad499b0368dc551e8c3cd15293c99160c7f1f07b59a48"}, - {file = "frozenlist-1.3.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:0771aed7f596c7d73444c847a1c16288937ef988dc04fb9f7be4b2aa91db609d"}, - {file = "frozenlist-1.3.3-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:66080ec69883597e4d026f2f71a231a1ee9887835902dbe6b6467d5a89216cf6"}, - {file = "frozenlist-1.3.3-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:41fe21dc74ad3a779c3d73a2786bdf622ea81234bdd4faf90b8b03cad0c2c0b4"}, - {file = "frozenlist-1.3.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f20380df709d91525e4bee04746ba612a4df0972c1b8f8e1e8af997e678c7b81"}, - {file = "frozenlist-1.3.3-cp311-cp311-win32.whl", hash = "sha256:f30f1928162e189091cf4d9da2eac617bfe78ef907a761614ff577ef4edfb3c8"}, - {file = "frozenlist-1.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:a6394d7dadd3cfe3f4b3b186e54d5d8504d44f2d58dcc89d693698e8b7132b32"}, - {file = "frozenlist-1.3.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8df3de3a9ab8325f94f646609a66cbeeede263910c5c0de0101079ad541af332"}, - {file = "frozenlist-1.3.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0693c609e9742c66ba4870bcee1ad5ff35462d5ffec18710b4ac89337ff16e27"}, - {file = "frozenlist-1.3.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cd4210baef299717db0a600d7a3cac81d46ef0e007f88c9335db79f8979c0d3d"}, - {file = "frozenlist-1.3.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:394c9c242113bfb4b9aa36e2b80a05ffa163a30691c7b5a29eba82e937895d5e"}, - {file = "frozenlist-1.3.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6327eb8e419f7d9c38f333cde41b9ae348bec26d840927332f17e887a8dcb70d"}, - {file = "frozenlist-1.3.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e24900aa13212e75e5b366cb9065e78bbf3893d4baab6052d1aca10d46d944c"}, - {file = "frozenlist-1.3.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:3843f84a6c465a36559161e6c59dce2f2ac10943040c2fd021cfb70d58c4ad56"}, - {file = "frozenlist-1.3.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:84610c1502b2461255b4c9b7d5e9c48052601a8957cd0aea6ec7a7a1e1fb9420"}, - {file = "frozenlist-1.3.3-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:c21b9aa40e08e4f63a2f92ff3748e6b6c84d717d033c7b3438dd3123ee18f70e"}, - {file = "frozenlist-1.3.3-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:efce6ae830831ab6a22b9b4091d411698145cb9b8fc869e1397ccf4b4b6455cb"}, - {file = "frozenlist-1.3.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:40de71985e9042ca00b7953c4f41eabc3dc514a2d1ff534027f091bc74416401"}, - {file = "frozenlist-1.3.3-cp37-cp37m-win32.whl", hash = "sha256:180c00c66bde6146a860cbb81b54ee0df350d2daf13ca85b275123bbf85de18a"}, - {file = "frozenlist-1.3.3-cp37-cp37m-win_amd64.whl", hash = "sha256:9bbbcedd75acdfecf2159663b87f1bb5cfc80e7cd99f7ddd9d66eb98b14a8411"}, - {file = "frozenlist-1.3.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:034a5c08d36649591be1cbb10e09da9f531034acfe29275fc5454a3b101ce41a"}, - {file = "frozenlist-1.3.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ba64dc2b3b7b158c6660d49cdb1d872d1d0bf4e42043ad8d5006099479a194e5"}, - {file = "frozenlist-1.3.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:47df36a9fe24054b950bbc2db630d508cca3aa27ed0566c0baf661225e52c18e"}, - {file = "frozenlist-1.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:008a054b75d77c995ea26629ab3a0c0d7281341f2fa7e1e85fa6153ae29ae99c"}, - {file = "frozenlist-1.3.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:841ea19b43d438a80b4de62ac6ab21cfe6827bb8a9dc62b896acc88eaf9cecba"}, - {file = "frozenlist-1.3.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e235688f42b36be2b6b06fc37ac2126a73b75fb8d6bc66dd632aa35286238703"}, - {file = "frozenlist-1.3.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca713d4af15bae6e5d79b15c10c8522859a9a89d3b361a50b817c98c2fb402a2"}, - {file = "frozenlist-1.3.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ac5995f2b408017b0be26d4a1d7c61bce106ff3d9e3324374d66b5964325448"}, - {file = "frozenlist-1.3.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:a4ae8135b11652b08a8baf07631d3ebfe65a4c87909dbef5fa0cdde440444ee4"}, - {file = "frozenlist-1.3.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4ea42116ceb6bb16dbb7d526e242cb6747b08b7710d9782aa3d6732bd8d27649"}, - {file = "frozenlist-1.3.3-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:810860bb4bdce7557bc0febb84bbd88198b9dbc2022d8eebe5b3590b2ad6c842"}, - {file = "frozenlist-1.3.3-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:ee78feb9d293c323b59a6f2dd441b63339a30edf35abcb51187d2fc26e696d13"}, - {file = "frozenlist-1.3.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = 
"sha256:0af2e7c87d35b38732e810befb9d797a99279cbb85374d42ea61c1e9d23094b3"}, - {file = "frozenlist-1.3.3-cp38-cp38-win32.whl", hash = "sha256:899c5e1928eec13fd6f6d8dc51be23f0d09c5281e40d9cf4273d188d9feeaf9b"}, - {file = "frozenlist-1.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:7f44e24fa70f6fbc74aeec3e971f60a14dde85da364aa87f15d1be94ae75aeef"}, - {file = "frozenlist-1.3.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:2b07ae0c1edaa0a36339ec6cce700f51b14a3fc6545fdd32930d2c83917332cf"}, - {file = "frozenlist-1.3.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ebb86518203e12e96af765ee89034a1dbb0c3c65052d1b0c19bbbd6af8a145e1"}, - {file = "frozenlist-1.3.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5cf820485f1b4c91e0417ea0afd41ce5cf5965011b3c22c400f6d144296ccbc0"}, - {file = "frozenlist-1.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c11e43016b9024240212d2a65043b70ed8dfd3b52678a1271972702d990ac6d"}, - {file = "frozenlist-1.3.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8fa3c6e3305aa1146b59a09b32b2e04074945ffcfb2f0931836d103a2c38f936"}, - {file = "frozenlist-1.3.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:352bd4c8c72d508778cf05ab491f6ef36149f4d0cb3c56b1b4302852255d05d5"}, - {file = "frozenlist-1.3.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:65a5e4d3aa679610ac6e3569e865425b23b372277f89b5ef06cf2cdaf1ebf22b"}, - {file = "frozenlist-1.3.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1e2c1185858d7e10ff045c496bbf90ae752c28b365fef2c09cf0fa309291669"}, - {file = "frozenlist-1.3.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f163d2fd041c630fed01bc48d28c3ed4a3b003c00acd396900e11ee5316b56bb"}, - {file = "frozenlist-1.3.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:05cdb16d09a0832eedf770cb7bd1fe57d8cf4eaf5aced29c4e41e3f20b30a784"}, - {file = "frozenlist-1.3.3-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:8bae29d60768bfa8fb92244b74502b18fae55a80eac13c88eb0b496d4268fd2d"}, - {file = "frozenlist-1.3.3-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:eedab4c310c0299961ac285591acd53dc6723a1ebd90a57207c71f6e0c2153ab"}, - {file = "frozenlist-1.3.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:3bbdf44855ed8f0fbcd102ef05ec3012d6a4fd7c7562403f76ce6a52aeffb2b1"}, - {file = "frozenlist-1.3.3-cp39-cp39-win32.whl", hash = "sha256:efa568b885bca461f7c7b9e032655c0c143d305bf01c30caf6db2854a4532b38"}, - {file = "frozenlist-1.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:cfe33efc9cb900a4c46f91a5ceba26d6df370ffddd9ca386eb1d4f0ad97b9ea9"}, - {file = "frozenlist-1.3.3.tar.gz", hash = "sha256:58bcc55721e8a90b88332d6cd441261ebb22342e238296bb330968952fbb3a6a"}, + {file = "frozenlist-1.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:764226ceef3125e53ea2cb275000e309c0aa5464d43bd72abd661e27fffc26ab"}, + {file = "frozenlist-1.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d6484756b12f40003c6128bfcc3fa9f0d49a687e171186c2d85ec82e3758c559"}, + {file = "frozenlist-1.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9ac08e601308e41eb533f232dbf6b7e4cea762f9f84f6357136eed926c15d12c"}, + {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d081f13b095d74b67d550de04df1c756831f3b83dc9881c38985834387487f1b"}, + {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash 
= "sha256:71932b597f9895f011f47f17d6428252fc728ba2ae6024e13c3398a087c2cdea"}, + {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:981b9ab5a0a3178ff413bca62526bb784249421c24ad7381e39d67981be2c326"}, + {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e41f3de4df3e80de75845d3e743b3f1c4c8613c3997a912dbf0229fc61a8b963"}, + {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6918d49b1f90821e93069682c06ffde41829c346c66b721e65a5c62b4bab0300"}, + {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0e5c8764c7829343d919cc2dfc587a8db01c4f70a4ebbc49abde5d4b158b007b"}, + {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:8d0edd6b1c7fb94922bf569c9b092ee187a83f03fb1a63076e7774b60f9481a8"}, + {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:e29cda763f752553fa14c68fb2195150bfab22b352572cb36c43c47bedba70eb"}, + {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:0c7c1b47859ee2cac3846fde1c1dc0f15da6cec5a0e5c72d101e0f83dcb67ff9"}, + {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:901289d524fdd571be1c7be054f48b1f88ce8dddcbdf1ec698b27d4b8b9e5d62"}, + {file = "frozenlist-1.4.0-cp310-cp310-win32.whl", hash = "sha256:1a0848b52815006ea6596c395f87449f693dc419061cc21e970f139d466dc0a0"}, + {file = "frozenlist-1.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:b206646d176a007466358aa21d85cd8600a415c67c9bd15403336c331a10d956"}, + {file = "frozenlist-1.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:de343e75f40e972bae1ef6090267f8260c1446a1695e77096db6cfa25e759a95"}, + {file = "frozenlist-1.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ad2a9eb6d9839ae241701d0918f54c51365a51407fd80f6b8289e2dfca977cc3"}, + {file = "frozenlist-1.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bd7bd3b3830247580de99c99ea2a01416dfc3c34471ca1298bccabf86d0ff4dc"}, + {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bdf1847068c362f16b353163391210269e4f0569a3c166bc6a9f74ccbfc7e839"}, + {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:38461d02d66de17455072c9ba981d35f1d2a73024bee7790ac2f9e361ef1cd0c"}, + {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5a32087d720c608f42caed0ef36d2b3ea61a9d09ee59a5142d6070da9041b8f"}, + {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dd65632acaf0d47608190a71bfe46b209719bf2beb59507db08ccdbe712f969b"}, + {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:261b9f5d17cac914531331ff1b1d452125bf5daa05faf73b71d935485b0c510b"}, + {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b89ac9768b82205936771f8d2eb3ce88503b1556324c9f903e7156669f521472"}, + {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:008eb8b31b3ea6896da16c38c1b136cb9fec9e249e77f6211d479db79a4eaf01"}, + {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e74b0506fa5aa5598ac6a975a12aa8928cbb58e1f5ac8360792ef15de1aa848f"}, + {file = 
"frozenlist-1.4.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:490132667476f6781b4c9458298b0c1cddf237488abd228b0b3650e5ecba7467"}, + {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:76d4711f6f6d08551a7e9ef28c722f4a50dd0fc204c56b4bcd95c6cc05ce6fbb"}, + {file = "frozenlist-1.4.0-cp311-cp311-win32.whl", hash = "sha256:a02eb8ab2b8f200179b5f62b59757685ae9987996ae549ccf30f983f40602431"}, + {file = "frozenlist-1.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:515e1abc578dd3b275d6a5114030b1330ba044ffba03f94091842852f806f1c1"}, + {file = "frozenlist-1.4.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:f0ed05f5079c708fe74bf9027e95125334b6978bf07fd5ab923e9e55e5fbb9d3"}, + {file = "frozenlist-1.4.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ca265542ca427bf97aed183c1676e2a9c66942e822b14dc6e5f42e038f92a503"}, + {file = "frozenlist-1.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:491e014f5c43656da08958808588cc6c016847b4360e327a62cb308c791bd2d9"}, + {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:17ae5cd0f333f94f2e03aaf140bb762c64783935cc764ff9c82dff626089bebf"}, + {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1e78fb68cf9c1a6aa4a9a12e960a5c9dfbdb89b3695197aa7064705662515de2"}, + {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5655a942f5f5d2c9ed93d72148226d75369b4f6952680211972a33e59b1dfdc"}, + {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c11b0746f5d946fecf750428a95f3e9ebe792c1ee3b1e96eeba145dc631a9672"}, + {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e66d2a64d44d50d2543405fb183a21f76b3b5fd16f130f5c99187c3fb4e64919"}, + {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:88f7bc0fcca81f985f78dd0fa68d2c75abf8272b1f5c323ea4a01a4d7a614efc"}, + {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:5833593c25ac59ede40ed4de6d67eb42928cca97f26feea219f21d0ed0959b79"}, + {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:fec520865f42e5c7f050c2a79038897b1c7d1595e907a9e08e3353293ffc948e"}, + {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:b826d97e4276750beca7c8f0f1a4938892697a6bcd8ec8217b3312dad6982781"}, + {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:ceb6ec0a10c65540421e20ebd29083c50e6d1143278746a4ef6bcf6153171eb8"}, + {file = "frozenlist-1.4.0-cp38-cp38-win32.whl", hash = "sha256:2b8bcf994563466db019fab287ff390fffbfdb4f905fc77bc1c1d604b1c689cc"}, + {file = "frozenlist-1.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:a6c8097e01886188e5be3e6b14e94ab365f384736aa1fca6a0b9e35bd4a30bc7"}, + {file = "frozenlist-1.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:6c38721585f285203e4b4132a352eb3daa19121a035f3182e08e437cface44bf"}, + {file = "frozenlist-1.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a0c6da9aee33ff0b1a451e867da0c1f47408112b3391dd43133838339e410963"}, + {file = "frozenlist-1.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:93ea75c050c5bb3d98016b4ba2497851eadf0ac154d88a67d7a6816206f6fa7f"}, + {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f61e2dc5ad442c52b4887f1fdc112f97caeff4d9e6ebe78879364ac59f1663e1"}, + 
{file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aa384489fefeb62321b238e64c07ef48398fe80f9e1e6afeff22e140e0850eef"}, + {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:10ff5faaa22786315ef57097a279b833ecab1a0bfb07d604c9cbb1c4cdc2ed87"}, + {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:007df07a6e3eb3e33e9a1fe6a9db7af152bbd8a185f9aaa6ece10a3529e3e1c6"}, + {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f4f399d28478d1f604c2ff9119907af9726aed73680e5ed1ca634d377abb087"}, + {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:c5374b80521d3d3f2ec5572e05adc94601985cc526fb276d0c8574a6d749f1b3"}, + {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:ce31ae3e19f3c902de379cf1323d90c649425b86de7bbdf82871b8a2a0615f3d"}, + {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:7211ef110a9194b6042449431e08c4d80c0481e5891e58d429df5899690511c2"}, + {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:556de4430ce324c836789fa4560ca62d1591d2538b8ceb0b4f68fb7b2384a27a"}, + {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7645a8e814a3ee34a89c4a372011dcd817964ce8cb273c8ed6119d706e9613e3"}, + {file = "frozenlist-1.4.0-cp39-cp39-win32.whl", hash = "sha256:19488c57c12d4e8095a922f328df3f179c820c212940a498623ed39160bc3c2f"}, + {file = "frozenlist-1.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:6221d84d463fb110bdd7619b69cb43878a11d51cbb9394ae3105d082d5199167"}, + {file = "frozenlist-1.4.0.tar.gz", hash = "sha256:09163bdf0b2907454042edb19f887c6d33806adc71fbd54afc14908bfdc22251"}, ] [[package]] @@ -472,6 +471,82 @@ files = [ [package.extras] rewrite = ["tokenize-rt (>=3)"] +[[package]] +name = "google-auth" +version = "2.22.0" +description = "Google Authentication Library" +category = "dev" +optional = false +python-versions = ">=3.6" +files = [ + {file = "google-auth-2.22.0.tar.gz", hash = "sha256:164cba9af4e6e4e40c3a4f90a1a6c12ee56f14c0b4868d1ca91b32826ab334ce"}, + {file = "google_auth-2.22.0-py2.py3-none-any.whl", hash = "sha256:d61d1b40897407b574da67da1a833bdc10d5a11642566e506565d1b1a46ba873"}, +] + +[package.dependencies] +cachetools = ">=2.0.0,<6.0" +pyasn1-modules = ">=0.2.1" +rsa = ">=3.1.4,<5" +six = ">=1.9.0" +urllib3 = "<2.0" + +[package.extras] +aiohttp = ["aiohttp (>=3.6.2,<4.0.0.dev0)", "requests (>=2.20.0,<3.0.0.dev0)"] +enterprise-cert = ["cryptography (==36.0.2)", "pyopenssl (==22.0.0)"] +pyopenssl = ["cryptography (>=38.0.3)", "pyopenssl (>=20.0.0)"] +reauth = ["pyu2f (>=0.1.5)"] +requests = ["requests (>=2.20.0,<3.0.0.dev0)"] + +[[package]] +name = "google-auth-oauthlib" +version = "1.0.0" +description = "Google Authentication Library" +category = "dev" +optional = false +python-versions = ">=3.6" +files = [ + {file = "google-auth-oauthlib-1.0.0.tar.gz", hash = "sha256:e375064964820b47221a7e1b7ee1fd77051b6323c3f9e3e19785f78ab67ecfc5"}, + {file = "google_auth_oauthlib-1.0.0-py2.py3-none-any.whl", hash = "sha256:95880ca704928c300f48194d1770cf5b1462835b6e49db61445a520f793fd5fb"}, +] + +[package.dependencies] +google-auth = ">=2.15.0" +requests-oauthlib = ">=0.7.0" + +[package.extras] +tool = ["click (>=6.0.0)"] + +[[package]] +name = "gspread" +version = "5.10.0" +description = "Google Spreadsheets Python API" 
+category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "gspread-5.10.0-py3-none-any.whl", hash = "sha256:f58b15d950ef5c45c8607edb3687188d5a543f2b66242f9c26fbb0d2cf36169d"}, + {file = "gspread-5.10.0.tar.gz", hash = "sha256:2b6bba6dc111580170346a9bcd1893e0e8c52f67a9e537caec7b7a1e27c14435"}, +] + +[package.dependencies] +google-auth = ">=1.12.0" +google-auth-oauthlib = ">=0.4.1" + +[[package]] +name = "httplib2" +version = "0.22.0" +description = "A comprehensive HTTP client library." +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "httplib2-0.22.0-py3-none-any.whl", hash = "sha256:14ae0a53c1ba8f3d37e9e27cf37eabb0fb9980f435ba405d546948b009dd64dc"}, + {file = "httplib2-0.22.0.tar.gz", hash = "sha256:d7a10bc5ef5ab08322488bde8c726eeee5c8618723fdb399597ec58f3d82df81"}, +] + +[package.dependencies] +pyparsing = {version = ">=2.4.2,<3.0.0 || >3.0.0,<3.0.1 || >3.0.1,<3.0.2 || >3.0.2,<3.0.3 || >3.0.3,<4", markers = "python_version > \"3.0\""} + [[package]] name = "idna" version = "3.4" @@ -682,6 +757,77 @@ doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx- extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"] test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] +[[package]] +name = "numpy" +version = "1.25.1" +description = "Fundamental package for array computing in Python" +category = "dev" +optional = false +python-versions = ">=3.9" +files = [ + {file = "numpy-1.25.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:77d339465dff3eb33c701430bcb9c325b60354698340229e1dff97745e6b3efa"}, + {file = "numpy-1.25.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d736b75c3f2cb96843a5c7f8d8ccc414768d34b0a75f466c05f3a739b406f10b"}, + {file = "numpy-1.25.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a90725800caeaa160732d6b31f3f843ebd45d6b5f3eec9e8cc287e30f2805bf"}, + {file = "numpy-1.25.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c6c9261d21e617c6dc5eacba35cb68ec36bb72adcff0dee63f8fbc899362588"}, + {file = "numpy-1.25.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0def91f8af6ec4bb94c370e38c575855bf1d0be8a8fbfba42ef9c073faf2cf19"}, + {file = "numpy-1.25.1-cp310-cp310-win32.whl", hash = "sha256:fd67b306320dcadea700a8f79b9e671e607f8696e98ec255915c0c6d6b818503"}, + {file = "numpy-1.25.1-cp310-cp310-win_amd64.whl", hash = "sha256:c1516db588987450b85595586605742879e50dcce923e8973f79529651545b57"}, + {file = "numpy-1.25.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6b82655dd8efeea69dbf85d00fca40013d7f503212bc5259056244961268b66e"}, + {file = "numpy-1.25.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e8f6049c4878cb16960fbbfb22105e49d13d752d4d8371b55110941fb3b17800"}, + {file = "numpy-1.25.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41a56b70e8139884eccb2f733c2f7378af06c82304959e174f8e7370af112e09"}, + {file = "numpy-1.25.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5154b1a25ec796b1aee12ac1b22f414f94752c5f94832f14d8d6c9ac40bcca6"}, + {file = "numpy-1.25.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:38eb6548bb91c421261b4805dc44def9ca1a6eef6444ce35ad1669c0f1a3fc5d"}, + {file = "numpy-1.25.1-cp311-cp311-win32.whl", hash = "sha256:791f409064d0a69dd20579345d852c59822c6aa087f23b07b1b4e28ff5880fcb"}, + {file = "numpy-1.25.1-cp311-cp311-win_amd64.whl", hash = 
"sha256:c40571fe966393b212689aa17e32ed905924120737194b5d5c1b20b9ed0fb171"}, + {file = "numpy-1.25.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3d7abcdd85aea3e6cdddb59af2350c7ab1ed764397f8eec97a038ad244d2d105"}, + {file = "numpy-1.25.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1a180429394f81c7933634ae49b37b472d343cccb5bb0c4a575ac8bbc433722f"}, + {file = "numpy-1.25.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d412c1697c3853c6fc3cb9751b4915859c7afe6a277c2bf00acf287d56c4e625"}, + {file = "numpy-1.25.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:20e1266411120a4f16fad8efa8e0454d21d00b8c7cee5b5ccad7565d95eb42dd"}, + {file = "numpy-1.25.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:f76aebc3358ade9eacf9bc2bb8ae589863a4f911611694103af05346637df1b7"}, + {file = "numpy-1.25.1-cp39-cp39-win32.whl", hash = "sha256:247d3ffdd7775bdf191f848be8d49100495114c82c2bd134e8d5d075fb386a1c"}, + {file = "numpy-1.25.1-cp39-cp39-win_amd64.whl", hash = "sha256:1d5d3c68e443c90b38fdf8ef40e60e2538a27548b39b12b73132456847f4b631"}, + {file = "numpy-1.25.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:35a9527c977b924042170a0887de727cd84ff179e478481404c5dc66b4170009"}, + {file = "numpy-1.25.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d3fe3dd0506a28493d82dc3cf254be8cd0d26f4008a417385cbf1ae95b54004"}, + {file = "numpy-1.25.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:012097b5b0d00a11070e8f2e261128c44157a8689f7dedcf35576e525893f4fe"}, + {file = "numpy-1.25.1.tar.gz", hash = "sha256:9a3a9f3a61480cc086117b426a8bd86869c213fc4072e606f01c4e4b66eb92bf"}, +] + +[[package]] +name = "oauth2client" +version = "4.1.3" +description = "OAuth 2.0 client library" +category = "dev" +optional = false +python-versions = "*" +files = [ + {file = "oauth2client-4.1.3-py2.py3-none-any.whl", hash = "sha256:b8a81cc5d60e2d364f0b1b98f958dbd472887acaf1a5b05e21c28c31a2d6d3ac"}, + {file = "oauth2client-4.1.3.tar.gz", hash = "sha256:d486741e451287f69568a4d26d70d9acd73a2bbfa275746c535b4209891cccc6"}, +] + +[package.dependencies] +httplib2 = ">=0.9.1" +pyasn1 = ">=0.1.7" +pyasn1-modules = ">=0.0.5" +rsa = ">=3.1.4" +six = ">=1.6.1" + +[[package]] +name = "oauthlib" +version = "3.2.2" +description = "A generic, spec-compliant, thorough implementation of the OAuth request-signing logic" +category = "dev" +optional = false +python-versions = ">=3.6" +files = [ + {file = "oauthlib-3.2.2-py3-none-any.whl", hash = "sha256:8139f29aac13e25d502680e9e19963e83f16838d48a0d71c287fe40e7067fbca"}, + {file = "oauthlib-3.2.2.tar.gz", hash = "sha256:9859c40929662bec5d64f34d01c99e093149682a3f38915dc0655d5a633dd918"}, +] + +[package.extras] +rsa = ["cryptography (>=3.0.0)"] +signals = ["blinker (>=1.4.0)"] +signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"] + [[package]] name = "openai" version = "0.27.8" @@ -717,6 +863,73 @@ files = [ {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, ] +[[package]] +name = "pandas" +version = "2.0.3" +description = "Powerful data structures for data analysis, time series, and statistics" +category = "dev" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pandas-2.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e4c7c9f27a4185304c7caf96dc7d91bc60bc162221152de697c98eb0b2648dd8"}, + {file = "pandas-2.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:f167beed68918d62bffb6ec64f2e1d8a7d297a038f86d4aed056b9493fca407f"}, + {file = "pandas-2.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce0c6f76a0f1ba361551f3e6dceaff06bde7514a374aa43e33b588ec10420183"}, + {file = "pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba619e410a21d8c387a1ea6e8a0e49bb42216474436245718d7f2e88a2f8d7c0"}, + {file = "pandas-2.0.3-cp310-cp310-win32.whl", hash = "sha256:3ef285093b4fe5058eefd756100a367f27029913760773c8bf1d2d8bebe5d210"}, + {file = "pandas-2.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:9ee1a69328d5c36c98d8e74db06f4ad518a1840e8ccb94a4ba86920986bb617e"}, + {file = "pandas-2.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b084b91d8d66ab19f5bb3256cbd5ea661848338301940e17f4492b2ce0801fe8"}, + {file = "pandas-2.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:37673e3bdf1551b95bf5d4ce372b37770f9529743d2498032439371fc7b7eb26"}, + {file = "pandas-2.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9cb1e14fdb546396b7e1b923ffaeeac24e4cedd14266c3497216dd4448e4f2d"}, + {file = "pandas-2.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9cd88488cceb7635aebb84809d087468eb33551097d600c6dad13602029c2df"}, + {file = "pandas-2.0.3-cp311-cp311-win32.whl", hash = "sha256:694888a81198786f0e164ee3a581df7d505024fbb1f15202fc7db88a71d84ebd"}, + {file = "pandas-2.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:6a21ab5c89dcbd57f78d0ae16630b090eec626360085a4148693def5452d8a6b"}, + {file = "pandas-2.0.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9e4da0d45e7f34c069fe4d522359df7d23badf83abc1d1cef398895822d11061"}, + {file = "pandas-2.0.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32fca2ee1b0d93dd71d979726b12b61faa06aeb93cf77468776287f41ff8fdc5"}, + {file = "pandas-2.0.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:258d3624b3ae734490e4d63c430256e716f488c4fcb7c8e9bde2d3aa46c29089"}, + {file = "pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9eae3dc34fa1aa7772dd3fc60270d13ced7346fcbcfee017d3132ec625e23bb0"}, + {file = "pandas-2.0.3-cp38-cp38-win32.whl", hash = "sha256:f3421a7afb1a43f7e38e82e844e2bca9a6d793d66c1a7f9f0ff39a795bbc5e02"}, + {file = "pandas-2.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:69d7f3884c95da3a31ef82b7618af5710dba95bb885ffab339aad925c3e8ce78"}, + {file = "pandas-2.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5247fb1ba347c1261cbbf0fcfba4a3121fbb4029d95d9ef4dc45406620b25c8b"}, + {file = "pandas-2.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:81af086f4543c9d8bb128328b5d32e9986e0c84d3ee673a2ac6fb57fd14f755e"}, + {file = "pandas-2.0.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1994c789bf12a7c5098277fb43836ce090f1073858c10f9220998ac74f37c69b"}, + {file = "pandas-2.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ec591c48e29226bcbb316e0c1e9423622bc7a4eaf1ef7c3c9fa1a3981f89641"}, + {file = "pandas-2.0.3-cp39-cp39-win32.whl", hash = "sha256:04dbdbaf2e4d46ca8da896e1805bc04eb85caa9a82e259e8eed00254d5e0c682"}, + {file = "pandas-2.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:1168574b036cd8b93abc746171c9b4f1b83467438a5e45909fed645cf8692dbc"}, + {file = "pandas-2.0.3.tar.gz", hash = "sha256:c02f372a88e0d17f36d3093a644c73cfc1788e876a7c4bcb4020a77512e2043c"}, +] + +[package.dependencies] +numpy = [ + {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, + 
{version = ">=1.23.2", markers = "python_version >= \"3.11\""}, +] +python-dateutil = ">=2.8.2" +pytz = ">=2020.1" +tzdata = ">=2022.1" + +[package.extras] +all = ["PyQt5 (>=5.15.1)", "SQLAlchemy (>=1.4.16)", "beautifulsoup4 (>=4.9.3)", "bottleneck (>=1.3.2)", "brotlipy (>=0.7.0)", "fastparquet (>=0.6.3)", "fsspec (>=2021.07.0)", "gcsfs (>=2021.07.0)", "html5lib (>=1.1)", "hypothesis (>=6.34.2)", "jinja2 (>=3.0.0)", "lxml (>=4.6.3)", "matplotlib (>=3.6.1)", "numba (>=0.53.1)", "numexpr (>=2.7.3)", "odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pandas-gbq (>=0.15.0)", "psycopg2 (>=2.8.6)", "pyarrow (>=7.0.0)", "pymysql (>=1.0.2)", "pyreadstat (>=1.1.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)", "python-snappy (>=0.6.0)", "pyxlsb (>=1.0.8)", "qtpy (>=2.2.0)", "s3fs (>=2021.08.0)", "scipy (>=1.7.1)", "tables (>=3.6.1)", "tabulate (>=0.8.9)", "xarray (>=0.21.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)", "zstandard (>=0.15.2)"] +aws = ["s3fs (>=2021.08.0)"] +clipboard = ["PyQt5 (>=5.15.1)", "qtpy (>=2.2.0)"] +compression = ["brotlipy (>=0.7.0)", "python-snappy (>=0.6.0)", "zstandard (>=0.15.2)"] +computation = ["scipy (>=1.7.1)", "xarray (>=0.21.0)"] +excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pyxlsb (>=1.0.8)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)"] +feather = ["pyarrow (>=7.0.0)"] +fss = ["fsspec (>=2021.07.0)"] +gcp = ["gcsfs (>=2021.07.0)", "pandas-gbq (>=0.15.0)"] +hdf5 = ["tables (>=3.6.1)"] +html = ["beautifulsoup4 (>=4.9.3)", "html5lib (>=1.1)", "lxml (>=4.6.3)"] +mysql = ["SQLAlchemy (>=1.4.16)", "pymysql (>=1.0.2)"] +output-formatting = ["jinja2 (>=3.0.0)", "tabulate (>=0.8.9)"] +parquet = ["pyarrow (>=7.0.0)"] +performance = ["bottleneck (>=1.3.2)", "numba (>=0.53.1)", "numexpr (>=2.7.1)"] +plot = ["matplotlib (>=3.6.1)"] +postgresql = ["SQLAlchemy (>=1.4.16)", "psycopg2 (>=2.8.6)"] +spss = ["pyreadstat (>=1.1.2)"] +sql-other = ["SQLAlchemy (>=1.4.16)"] +test = ["hypothesis (>=6.34.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)"] +xml = ["lxml (>=4.6.3)"] + [[package]] name = "pathspec" version = "0.11.1" @@ -746,14 +959,14 @@ ptyprocess = ">=0.5" [[package]] name = "platformdirs" -version = "3.8.0" +version = "3.9.1" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." 
category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "platformdirs-3.8.0-py3-none-any.whl", hash = "sha256:ca9ed98ce73076ba72e092b23d3c93ea6c4e186b3f1c3dad6edd98ff6ffcca2e"}, - {file = "platformdirs-3.8.0.tar.gz", hash = "sha256:b0cabcb11063d21a0b261d557acb0a9d2126350e63b70cdf7db6347baea456dc"}, + {file = "platformdirs-3.9.1-py3-none-any.whl", hash = "sha256:ad8291ae0ae5072f66c16945166cb11c63394c7a3ad1b1bc9828ca3162da8c2f"}, + {file = "platformdirs-3.9.1.tar.gz", hash = "sha256:1b42b450ad933e981d56e59f1b97495428c9bd60698baab9f3eb3d00d5822421"}, ] [package.extras] @@ -788,6 +1001,33 @@ files = [ {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"}, ] +[[package]] +name = "pyasn1" +version = "0.5.0" +description = "Pure-Python implementation of ASN.1 types and DER/BER/CER codecs (X.208)" +category = "dev" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +files = [ + {file = "pyasn1-0.5.0-py2.py3-none-any.whl", hash = "sha256:87a2121042a1ac9358cabcaf1d07680ff97ee6404333bacca15f76aa8ad01a57"}, + {file = "pyasn1-0.5.0.tar.gz", hash = "sha256:97b7290ca68e62a832558ec3976f15cbf911bf5d7c7039d8b861c2a0ece69fde"}, +] + +[[package]] +name = "pyasn1-modules" +version = "0.3.0" +description = "A collection of ASN.1-based protocols modules" +category = "dev" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +files = [ + {file = "pyasn1_modules-0.3.0-py2.py3-none-any.whl", hash = "sha256:d3ccd6ed470d9ffbc716be08bd90efbd44d0734bc9303818f7336070984a162d"}, + {file = "pyasn1_modules-0.3.0.tar.gz", hash = "sha256:5bd01446b736eb9d31512a30d46c1ac3395d676c6f3cafa4c03eb54b9925631c"}, +] + +[package.dependencies] +pyasn1 = ">=0.4.6,<0.6.0" + [[package]] name = "pycodestyle" version = "2.7.0" @@ -802,48 +1042,48 @@ files = [ [[package]] name = "pydantic" -version = "1.10.10" +version = "1.10.11" description = "Data validation and settings management using python type hints" category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "pydantic-1.10.10-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:adad1ee4ab9888f12dac2529276704e719efcf472e38df7813f5284db699b4ec"}, - {file = "pydantic-1.10.10-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7a7db03339893feef2092ff7b1afc9497beed15ebd4af84c3042a74abce02d48"}, - {file = "pydantic-1.10.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67b3714b97ff84b2689654851c2426389bcabfac9080617bcf4306c69db606f6"}, - {file = "pydantic-1.10.10-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:edfdf0a5abc5c9bf2052ebaec20e67abd52e92d257e4f2d30e02c354ed3e6030"}, - {file = "pydantic-1.10.10-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:20a3b30fd255eeeb63caa9483502ba96b7795ce5bf895c6a179b3d909d9f53a6"}, - {file = "pydantic-1.10.10-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:db4c7f7e60ca6f7d6c1785070f3e5771fcb9b2d88546e334d2f2c3934d949028"}, - {file = "pydantic-1.10.10-cp310-cp310-win_amd64.whl", hash = "sha256:a2d5be50ac4a0976817144c7d653e34df2f9436d15555189f5b6f61161d64183"}, - {file = "pydantic-1.10.10-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:566a04ba755e8f701b074ffb134ddb4d429f75d5dced3fbd829a527aafe74c71"}, - {file = "pydantic-1.10.10-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f79db3652ed743309f116ba863dae0c974a41b688242482638b892246b7db21d"}, - {file = 
"pydantic-1.10.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c62376890b819bebe3c717a9ac841a532988372b7e600e76f75c9f7c128219d5"}, - {file = "pydantic-1.10.10-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4870f13a4fafd5bc3e93cff3169222534fad867918b188e83ee0496452978437"}, - {file = "pydantic-1.10.10-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:990027e77cda6072a566e433b6962ca3b96b4f3ae8bd54748e9d62a58284d9d7"}, - {file = "pydantic-1.10.10-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:8c40964596809eb616d94f9c7944511f620a1103d63d5510440ed2908fc410af"}, - {file = "pydantic-1.10.10-cp311-cp311-win_amd64.whl", hash = "sha256:ea9eebc2ebcba3717e77cdeee3f6203ffc0e78db5f7482c68b1293e8cc156e5e"}, - {file = "pydantic-1.10.10-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:762aa598f79b4cac2f275d13336b2dd8662febee2a9c450a49a2ab3bec4b385f"}, - {file = "pydantic-1.10.10-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6dab5219659f95e357d98d70577b361383057fb4414cfdb587014a5f5c595f7b"}, - {file = "pydantic-1.10.10-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3d4ee957a727ccb5a36f1b0a6dbd9fad5dedd2a41eada99a8df55c12896e18d"}, - {file = "pydantic-1.10.10-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:b69f9138dec566962ec65623c9d57bee44412d2fc71065a5f3ebb3820bdeee96"}, - {file = "pydantic-1.10.10-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:7aa75d1bd9cc275cf9782f50f60cddaf74cbaae19b6ada2a28e737edac420312"}, - {file = "pydantic-1.10.10-cp37-cp37m-win_amd64.whl", hash = "sha256:9f62a727f5c590c78c2d12fda302d1895141b767c6488fe623098f8792255fe5"}, - {file = "pydantic-1.10.10-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:aac218feb4af73db8417ca7518fb3bade4534fcca6e3fb00f84966811dd94450"}, - {file = "pydantic-1.10.10-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:88546dc10a40b5b52cae87d64666787aeb2878f9a9b37825aedc2f362e7ae1da"}, - {file = "pydantic-1.10.10-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c41bbaae89e32fc582448e71974de738c055aef5ab474fb25692981a08df808a"}, - {file = "pydantic-1.10.10-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b71bd504d1573b0b722ae536e8ffb796bedeef978979d076bf206e77dcc55a5"}, - {file = "pydantic-1.10.10-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:e088e3865a2270ecbc369924cd7d9fbc565667d9158e7f304e4097ebb9cf98dd"}, - {file = "pydantic-1.10.10-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:3403a090db45d4027d2344859d86eb797484dfda0706cf87af79ace6a35274ef"}, - {file = "pydantic-1.10.10-cp38-cp38-win_amd64.whl", hash = "sha256:e0014e29637125f4997c174dd6167407162d7af0da73414a9340461ea8573252"}, - {file = "pydantic-1.10.10-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9965e49c6905840e526e5429b09e4c154355b6ecc0a2f05492eda2928190311d"}, - {file = "pydantic-1.10.10-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:748d10ab6089c5d196e1c8be9de48274f71457b01e59736f7a09c9dc34f51887"}, - {file = "pydantic-1.10.10-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86936c383f7c38fd26d35107eb669c85d8f46dfceae873264d9bab46fe1c7dde"}, - {file = "pydantic-1.10.10-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7a26841be620309a9697f5b1ffc47dce74909e350c5315ccdac7a853484d468a"}, - {file = 
"pydantic-1.10.10-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:409b810f387610cc7405ab2fa6f62bdf7ea485311845a242ebc0bd0496e7e5ac"}, - {file = "pydantic-1.10.10-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ce937a2a2c020bcad1c9fde02892392a1123de6dda906ddba62bfe8f3e5989a2"}, - {file = "pydantic-1.10.10-cp39-cp39-win_amd64.whl", hash = "sha256:37ebddef68370e6f26243acc94de56d291e01227a67b2ace26ea3543cf53dd5f"}, - {file = "pydantic-1.10.10-py3-none-any.whl", hash = "sha256:a5939ec826f7faec434e2d406ff5e4eaf1716eb1f247d68cd3d0b3612f7b4c8a"}, - {file = "pydantic-1.10.10.tar.gz", hash = "sha256:3b8d5bd97886f9eb59260594207c9f57dce14a6f869c6ceea90188715d29921a"}, + {file = "pydantic-1.10.11-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ff44c5e89315b15ff1f7fdaf9853770b810936d6b01a7bcecaa227d2f8fe444f"}, + {file = "pydantic-1.10.11-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a6c098d4ab5e2d5b3984d3cb2527e2d6099d3de85630c8934efcfdc348a9760e"}, + {file = "pydantic-1.10.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:16928fdc9cb273c6af00d9d5045434c39afba5f42325fb990add2c241402d151"}, + {file = "pydantic-1.10.11-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0588788a9a85f3e5e9ebca14211a496409cb3deca5b6971ff37c556d581854e7"}, + {file = "pydantic-1.10.11-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e9baf78b31da2dc3d3f346ef18e58ec5f12f5aaa17ac517e2ffd026a92a87588"}, + {file = "pydantic-1.10.11-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:373c0840f5c2b5b1ccadd9286782852b901055998136287828731868027a724f"}, + {file = "pydantic-1.10.11-cp310-cp310-win_amd64.whl", hash = "sha256:c3339a46bbe6013ef7bdd2844679bfe500347ac5742cd4019a88312aa58a9847"}, + {file = "pydantic-1.10.11-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:08a6c32e1c3809fbc49debb96bf833164f3438b3696abf0fbeceb417d123e6eb"}, + {file = "pydantic-1.10.11-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a451ccab49971af043ec4e0d207cbc8cbe53dbf148ef9f19599024076fe9c25b"}, + {file = "pydantic-1.10.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5b02d24f7b2b365fed586ed73582c20f353a4c50e4be9ba2c57ab96f8091ddae"}, + {file = "pydantic-1.10.11-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3f34739a89260dfa420aa3cbd069fbcc794b25bbe5c0a214f8fb29e363484b66"}, + {file = "pydantic-1.10.11-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:e297897eb4bebde985f72a46a7552a7556a3dd11e7f76acda0c1093e3dbcf216"}, + {file = "pydantic-1.10.11-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d185819a7a059550ecb85d5134e7d40f2565f3dd94cfd870132c5f91a89cf58c"}, + {file = "pydantic-1.10.11-cp311-cp311-win_amd64.whl", hash = "sha256:4400015f15c9b464c9db2d5d951b6a780102cfa5870f2c036d37c23b56f7fc1b"}, + {file = "pydantic-1.10.11-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2417de68290434461a266271fc57274a138510dca19982336639484c73a07af6"}, + {file = "pydantic-1.10.11-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:331c031ba1554b974c98679bd0780d89670d6fd6f53f5d70b10bdc9addee1713"}, + {file = "pydantic-1.10.11-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8268a735a14c308923e8958363e3a3404f6834bb98c11f5ab43251a4e410170c"}, + {file = "pydantic-1.10.11-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:44e51ba599c3ef227e168424e220cd3e544288c57829520dc90ea9cb190c3248"}, + 
{file = "pydantic-1.10.11-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d7781f1d13b19700b7949c5a639c764a077cbbdd4322ed505b449d3ca8edcb36"}, + {file = "pydantic-1.10.11-cp37-cp37m-win_amd64.whl", hash = "sha256:7522a7666157aa22b812ce14c827574ddccc94f361237ca6ea8bb0d5c38f1629"}, + {file = "pydantic-1.10.11-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bc64eab9b19cd794a380179ac0e6752335e9555d214cfcb755820333c0784cb3"}, + {file = "pydantic-1.10.11-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:8dc77064471780262b6a68fe67e013298d130414d5aaf9b562c33987dbd2cf4f"}, + {file = "pydantic-1.10.11-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe429898f2c9dd209bd0632a606bddc06f8bce081bbd03d1c775a45886e2c1cb"}, + {file = "pydantic-1.10.11-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:192c608ad002a748e4a0bed2ddbcd98f9b56df50a7c24d9a931a8c5dd053bd3d"}, + {file = "pydantic-1.10.11-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ef55392ec4bb5721f4ded1096241e4b7151ba6d50a50a80a2526c854f42e6a2f"}, + {file = "pydantic-1.10.11-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:41e0bb6efe86281623abbeeb0be64eab740c865388ee934cd3e6a358784aca6e"}, + {file = "pydantic-1.10.11-cp38-cp38-win_amd64.whl", hash = "sha256:265a60da42f9f27e0b1014eab8acd3e53bd0bad5c5b4884e98a55f8f596b2c19"}, + {file = "pydantic-1.10.11-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:469adf96c8e2c2bbfa655fc7735a2a82f4c543d9fee97bd113a7fb509bf5e622"}, + {file = "pydantic-1.10.11-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e6cbfbd010b14c8a905a7b10f9fe090068d1744d46f9e0c021db28daeb8b6de1"}, + {file = "pydantic-1.10.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:abade85268cc92dff86d6effcd917893130f0ff516f3d637f50dadc22ae93999"}, + {file = "pydantic-1.10.11-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e9738b0f2e6c70f44ee0de53f2089d6002b10c33264abee07bdb5c7f03038303"}, + {file = "pydantic-1.10.11-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:787cf23e5a0cde753f2eabac1b2e73ae3844eb873fd1f5bdbff3048d8dbb7604"}, + {file = "pydantic-1.10.11-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:174899023337b9fc685ac8adaa7b047050616136ccd30e9070627c1aaab53a13"}, + {file = "pydantic-1.10.11-cp39-cp39-win_amd64.whl", hash = "sha256:1954f8778489a04b245a1e7b8b22a9d3ea8ef49337285693cf6959e4b757535e"}, + {file = "pydantic-1.10.11-py3-none-any.whl", hash = "sha256:008c5e266c8aada206d0627a011504e14268a62091450210eda7c07fabe6963e"}, + {file = "pydantic-1.10.11.tar.gz", hash = "sha256:f66d479cf7eb331372c470614be6511eae96f1f120344c25f3f9bb59fb1b5528"}, ] [package.dependencies] @@ -865,6 +1105,21 @@ files = [ {file = "pyflakes-2.3.1.tar.gz", hash = "sha256:f5bc8ecabc05bb9d291eb5203d6810b49040f6ff446a756326104746cc00c1db"}, ] +[[package]] +name = "pyparsing" +version = "3.1.0" +description = "pyparsing module - Classes and methods to define and execute parsing grammars" +category = "dev" +optional = false +python-versions = ">=3.6.8" +files = [ + {file = "pyparsing-3.1.0-py3-none-any.whl", hash = "sha256:d554a96d1a7d3ddaf7183104485bc19fd80543ad6ac5bdb6426719d766fb06c1"}, + {file = "pyparsing-3.1.0.tar.gz", hash = "sha256:edb662d6fe322d6e990b1594b5feaeadf806803359e3d4d42f11e295e588f0ea"}, +] + +[package.extras] +diagrams = ["jinja2", "railroad-diagrams"] + [[package]] name = "pytest" version = "7.4.0" @@ -906,6 +1161,21 @@ future-fstrings = "*" networkx = "*" pytest = 
">=3" +[[package]] +name = "python-dateutil" +version = "2.8.2" +description = "Extensions to the standard Python datetime module" +category = "dev" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +files = [ + {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, + {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, +] + +[package.dependencies] +six = ">=1.5" + [[package]] name = "python-dotenv" version = "0.21.1" @@ -921,6 +1191,18 @@ files = [ [package.extras] cli = ["click (>=5.0)"] +[[package]] +name = "pytz" +version = "2023.3" +description = "World timezone definitions, modern and historical" +category = "dev" +optional = false +python-versions = "*" +files = [ + {file = "pytz-2023.3-py2.py3-none-any.whl", hash = "sha256:a151b3abb88eda1d4e34a9814df37de2a80e301e68ba0fd856fb9b46bfbbbffb"}, + {file = "pytz-2023.3.tar.gz", hash = "sha256:1d8ce29db189191fb55338ee6d0387d82ab59f3d00eac103412d64e0ebd0c588"}, +] + [[package]] name = "requests" version = "2.31.0" @@ -943,6 +1225,52 @@ urllib3 = ">=1.21.1,<3" socks = ["PySocks (>=1.5.6,!=1.5.7)"] use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] +[[package]] +name = "requests-oauthlib" +version = "1.3.1" +description = "OAuthlib authentication support for Requests." +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "requests-oauthlib-1.3.1.tar.gz", hash = "sha256:75beac4a47881eeb94d5ea5d6ad31ef88856affe2332b9aafb52c6452ccf0d7a"}, + {file = "requests_oauthlib-1.3.1-py2.py3-none-any.whl", hash = "sha256:2577c501a2fb8d05a304c09d090d6e47c306fef15809d102b327cf8364bddab5"}, +] + +[package.dependencies] +oauthlib = ">=3.0.0" +requests = ">=2.0.0" + +[package.extras] +rsa = ["oauthlib[signedtoken] (>=3.0.0)"] + +[[package]] +name = "rsa" +version = "4.9" +description = "Pure-Python RSA implementation" +category = "dev" +optional = false +python-versions = ">=3.6,<4" +files = [ + {file = "rsa-4.9-py3-none-any.whl", hash = "sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7"}, + {file = "rsa-4.9.tar.gz", hash = "sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21"}, +] + +[package.dependencies] +pyasn1 = ">=0.1.3" + +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] + [[package]] name = "toml" version = "0.10.2" @@ -1027,23 +1355,34 @@ files = [ {file = "typing_extensions-4.7.1.tar.gz", hash = "sha256:b75ddc264f0ba5615db7ba217daeb99701ad295353c45f9e95963337ceeeffb2"}, ] +[[package]] +name = "tzdata" +version = "2023.3" +description = "Provider of IANA time zone data" +category = "dev" +optional = false +python-versions = ">=2" +files = [ + {file = "tzdata-2023.3-py2.py3-none-any.whl", hash = "sha256:7e65763eef3120314099b6939b5546db7adce1e7d6f2e179e3df563c70511eda"}, + {file = "tzdata-2023.3.tar.gz", hash = "sha256:11ef1e08e54acb0d4f95bdb1be05da659673de4acbd21bf9c69e94cc5e907a3a"}, +] + [[package]] name = "urllib3" -version = "2.0.3" +version = "1.26.16" description = "HTTP library with 
thread-safe connection pooling, file post, and more." category = "main" optional = false -python-versions = ">=3.7" +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" files = [ - {file = "urllib3-2.0.3-py3-none-any.whl", hash = "sha256:48e7fafa40319d358848e1bc6809b208340fafe2096f1725d05d67443d0483d1"}, - {file = "urllib3-2.0.3.tar.gz", hash = "sha256:bee28b5e56addb8226c96f7f13ac28cb4c301dd5ea8a6ca179c0b9835e032825"}, + {file = "urllib3-1.26.16-py2.py3-none-any.whl", hash = "sha256:8d36afa7616d8ab714608411b4a3b13e58f463aee519024578e062e141dce20f"}, + {file = "urllib3-1.26.16.tar.gz", hash = "sha256:8f135f6502756bde6b2a9b28989df5fbe87c9970cecaa69041edcce7f0589b14"}, ] [package.extras] -brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] -secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17.1.0)", "urllib3-secure-extra"] -socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] -zstd = ["zstandard (>=0.18.0)"] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"] +secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] +socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [[package]] name = "yarl" @@ -1136,4 +1475,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "09871e879785f0a7d5c31a61553cd2df08d88324a864b9c56b8e97d95893157f" +content-hash = "4dbf4bdf1965f80ad6ae3c27c6dab58e9ccf2e0fd154c8380e2df9e30455ffcd" diff --git a/pyproject.toml b/pyproject.toml index 48be9cf5d..cf0504d62 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,9 @@ mypy = "^0.910" isort = "^5.9.3" black = "22.3" autoflake = "^1.4" +pandas = "^2.0.3" +gspread = "^5.10.0" +oauth2client = "^4.1.3" [build-system] requires = ["poetry-core"] diff --git a/send_to_googledrive.py b/send_to_googledrive.py new file mode 100644 index 000000000..aa074ea6a --- /dev/null +++ b/send_to_googledrive.py @@ -0,0 +1,112 @@ +import base64 +import json +import os + +import gspread +import pandas as pd +from dotenv import load_dotenv +from oauth2client.service_account import ServiceAccountCredentials + +# Load environment variables from .env file +load_dotenv() + +# Get the base64 string from the environment variable +base64_creds = os.getenv("GDRIVE_BASE64") + +if base64_creds is None: + raise ValueError("The GDRIVE_BASE64 environment variable is not set") + +# Decode the base64 string into bytes +creds_bytes = base64.b64decode(base64_creds) + +# Convert the bytes into a string +creds_string = creds_bytes.decode("utf-8") + +# Parse the string into a JSON object +creds_info = json.loads(creds_string) + +# Define the base directory containing JSON files +base_dir = "reports" + +# Create a list to store each row of data +rows = [] + +# Loop over each directory in the base directory +for sub_dir in os.listdir(base_dir): + # Define the subdirectory path + sub_dir_path = os.path.join(base_dir, sub_dir) + + # Ensure the sub_dir_path is a directory + if os.path.isdir(sub_dir_path): + # Loop over each file in the subdirectory + for filename in os.listdir(sub_dir_path): + # Check if the file is a JSON file + if filename.endswith(".json"): + # Define the file path + file_path = os.path.join(sub_dir_path, filename) + + # Load the JSON data from the file + with open(file_path, "r") as f: + data = json.load(f) + + # Loop through each test + for test_name, test_info in data["tests"].items(): + # Create a dictionary to hold the information for this row + row = { + "Agent": 
sub_dir, + "Command": data.get("command", ""), + "Completion Time": data.get("completion_time", ""), + "Total Run Time": data.get("metrics", {}).get("run_time", ""), + "Highest Difficulty": data.get("metrics", {}).get( + "highest_difficulty", "" + ), + "Workspace": data.get("config", {}).get("workspace", ""), + "Test Name": test_name, + "Data Path": test_info.get("data_path", ""), + "Is Regression": test_info.get("is_regression", ""), + "Difficulty": test_info.get("metrics", {}).get( + "difficulty", "" + ), + "Success": test_info.get("metrics", {}).get("success", ""), + "Success %": test_info.get("metrics", {}).get("success_%", ""), + "Non mock success %": test_info.get("metrics", {}).get( + "non_mock_success_%", "" + ), + "Run Time": test_info.get("metrics", {}).get("run_time", ""), + } + + # Add this row to the list + rows.append(row) + +# Convert the list of rows into a DataFrame +df = pd.DataFrame(rows) + +# Define the scope +scope = [ + "https://spreadsheets.google.com/feeds", + "https://www.googleapis.com/auth/drive", +] + +# Add your service account credentials +creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_info, scope) + +# Authorize the gspread client +client = gspread.authorize(creds) + +# Get the instance of the Spreadsheet +sheet = client.open("benchmark") + +# Get the first sheet of the Spreadsheet +sheet_instance = sheet.get_worksheet(0) + +# Convert dataframe to list of lists for uploading to Google Sheets +values = df.values.tolist() + +# Prepend the header to the values list +values.insert(0, df.columns.tolist()) + +# Clear the existing values in the worksheet +sheet_instance.clear() + +# Update the worksheet with the new values +sheet_instance.append_rows(values) -- cgit v1.2.3 From 953060335d028c113f8180dcbf28362fda8b166d Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Tue, 18 Jul 2023 16:20:12 +0000 Subject: smol-developer-20230718162012 --- reports/smol-developer/file4_07-18-16-19.json | 266 ++++++++++++++++++++++++++ 1 file changed, 266 insertions(+) create mode 100644 reports/smol-developer/file4_07-18-16-19.json diff --git a/reports/smol-developer/file4_07-18-16-19.json b/reports/smol-developer/file4_07-18-16-19.json new file mode 100644 index 000000000..43fd780f8 --- /dev/null +++ b/reports/smol-developer/file4_07-18-16-19.json @@ -0,0 +1,266 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-18-16:20", + "metrics": { + "run_time": "31.71 seconds", + "highest_difficulty": "advanced: 5" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "5.187 seconds" + }, + "reached_cutoff": false + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": true, + "success_%": 100.0, + "run_time": "7.488 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "11.614 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "6.074 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "1.164 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": 
"agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. 
", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "generated" + } +} \ No newline at end of file -- cgit v1.2.3 From 52e7f92e54f2276b40e83a8aeb6a1da3a9ab2aab Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Tue, 18 Jul 2023 16:20:39 +0000 Subject: Auto-GPT-20230718162039 --- reports/Auto-GPT/file4_07-18-16-20.json | 268 ++++++++++++++++++++++ 1 file changed, 268 insertions(+) create mode 100644 reports/Auto-GPT/file4_07-18-16-20.json diff --git a/reports/Auto-GPT/file4_07-18-16-20.json b/reports/Auto-GPT/file4_07-18-16-20.json new file mode 100644 index 000000000..f7d6d7cb6 --- /dev/null +++ b/reports/Auto-GPT/file4_07-18-16-20.json @@ -0,0 +1,268 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-18-16:20", + "metrics": { + "run_time": "21.6 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "21.346 seconds" + }, + "reached_cutoff": false + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestThreeSum::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0], venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0], venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0], venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0], venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).",
+      "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.",
+      "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.",
+      "metrics": {
+        "difficulty": "intermediate",
+        "success": false,
+        "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.001 seconds"
+      },
+      "reached_cutoff": false
+    },
+    "TestRememberMultiplePhrasesWithNoise": {
+      "data_path": "agbenchmark/challenges/memory/m4_phrases",
+      "is_regression": false,
+      "task": "Follow the instructions in the instructions_1.txt file",
+      "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
+      "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
+      "metrics": {
+        "difficulty": "advanced",
+        "success": false,
+        "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.001 seconds"
+      },
+      "reached_cutoff": false
+    }
+  },
+  "config": {
+    "workspace": "autogpt/workspace/auto_gpt_workspace"
+  }
+}
\ No newline at end of file
-- cgit v1.2.3


From ef1ded34602d372b3134fa8026cb3cb0dd1f49e5 Mon Sep 17 00:00:00 2001
From: Auto-GPT-Bot
Date: Tue, 18 Jul 2023 16:21:54 +0000
Subject: gpt-engineer-20230718162154

---
 reports/gpt-engineer/file4_07-18-16-19.json | 267 ++++++++++++++++++++++++++++
 1 file changed, 267 insertions(+)
 create mode 100644 reports/gpt-engineer/file4_07-18-16-19.json

diff --git a/reports/gpt-engineer/file4_07-18-16-19.json b/reports/gpt-engineer/file4_07-18-16-19.json
new file mode 100644
index 000000000..2fdba0555
--- /dev/null
+++ b/reports/gpt-engineer/file4_07-18-16-19.json
@@ -0,0 +1,267 @@
+{
+  "command": "agbenchmark start",
+  "completion_time": "2023-07-18-16:21",
+  "metrics": {
+    "run_time": "124.12 seconds",
+    "highest_difficulty": "interface: 1"
+  },
+  "tests": {
+    "TestWriteFile": {
+      "data_path": "agbenchmark/challenges/interface/write_file",
+      "is_regression": false,
+      "task": "Print the capital of America to a .txt file",
+      "answer": "Washington",
+      "description": "Tests writing to a file",
+      "metrics": {
+        "difficulty": "interface",
+        "success": true,
+        "success_%": 100.0,
+        "run_time": "68.605 seconds"
+      },
+      "reached_cutoff": true
+    },
+    "TestBasicCodeGeneration": {
+      "data_path": "agbenchmark/challenges/code/d3_two_sum",
+      "is_regression": false,
+      "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. 
You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "[Errno 20] Not a directory: '/home/runner/work/Auto-GPT-Benchmarks/Auto-GPT-Benchmarks/agent/gpt-engineer/projects/my-new-project/workspace'", + "success_%": 0.0, + "run_time": "53.647 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in []", + "success_%": 0.0, + "run_time": "0.772 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "[Errno 20] Not a directory: '/home/runner/work/Auto-GPT-Benchmarks/Auto-GPT-Benchmarks/agent/gpt-engineer/projects/my-new-project/workspace/file_to_check.txt'", + "success_%": 0.0, + "run_time": "0.811 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestThreeSum::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.",
+      "metrics": {
+        "difficulty": "advanced",
+        "success": false,
+        "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.001 seconds"
+      },
+      "reached_cutoff": false
+    }
+  },
+  "config": {
+    "workspace": "projects/my-new-project/workspace"
+  }
+}
\ No newline at end of file
-- cgit v1.2.3


From fb1c34ec9f2e0078e53c856e4046ee3658c5f4c6 Mon Sep 17 00:00:00 2001
From: Auto-GPT-Bot
Date: Tue, 18 Jul 2023 16:27:05 +0000
Subject: mini-agi-20230718162705

---
 reports/mini-agi/file2_07-18-16-20.json | 260 ++++++++++++++++++++++++++++
 1 file changed, 260 insertions(+)
 create mode 100644 reports/mini-agi/file2_07-18-16-20.json

diff --git a/reports/mini-agi/file2_07-18-16-20.json b/reports/mini-agi/file2_07-18-16-20.json
new file mode 100644
index 000000000..e8cde442d
--- /dev/null
+++ b/reports/mini-agi/file2_07-18-16-20.json
@@ -0,0 +1,260 @@
+{
+  "command": "agbenchmark start",
+  "completion_time": "2023-07-18-16:26",
+  "metrics": {
+    "run_time": "400.95 seconds",
+    "highest_difficulty": "advanced: 5"
+  },
+  "tests": {
+    "TestWriteFile": {
+      "data_path": "agbenchmark/challenges/interface/write_file",
+      "is_regression": true,
+      "task": "Print the capital of America to a .txt file",
+      "answer": "Washington",
+      "description": "Tests writing to a file",
+      "metrics": {
+        "difficulty": "interface",
+        "success": true,
+        "success_%": 100.0,
+        "run_time": "15.813 seconds"
+      },
+      "reached_cutoff": false
+    },
+    "TestBasicCodeGeneration": {
+      "data_path": "agbenchmark/challenges/code/d3_two_sum",
+      "is_regression": false,
+      "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": true, + "success_%": 100.0, + "run_time": "32.591 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": true, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "22.01 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": true, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "25.419 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "59.541 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": true, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": true, + "success_%": 100.0, + "run_time": "25.535 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": true, + "success_%": 20.0, + "run_time": "49.13 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: '/home/runner/miniagi/result.txt'", + "success_%": 66.67, + "run_time": "57.587 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": true, + "success_%": 66.67, + "run_time": "22.668 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "11.021 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": true, + "success_%": 100.0, + "run_time": "53.932 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 25.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "25.51 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 33.33, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.",
+      "metrics": {
+        "difficulty": "advanced",
+        "success": false,
+        "fail_reason": "agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/mini-agi/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.001 seconds"
+      },
+      "reached_cutoff": false
+    }
+  },
+  "config": {
+    "workspace": "${os.path.join(Path.home(), 'miniagi')}"
+  }
+}
\ No newline at end of file
-- cgit v1.2.3


From 0c94bb5f2510661762e9406b8b5bce094d6249c0 Mon Sep 17 00:00:00 2001
From: Reinier van der Leer
Date: Tue, 18 Jul 2023 22:34:52 +0200
Subject: Fix configuring TTS engine (#5005)

---
 autogpt/config/config.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/autogpt/config/config.py b/autogpt/config/config.py
index cb3f26d3e..b6773511d 100644
--- a/autogpt/config/config.py
+++ b/autogpt/config/config.py
@@ -277,16 +277,16 @@ class ConfigBuilder(Configurable[Config]):
         config_dict["elevenlabs_voice_id"] = os.getenv(
             "ELEVENLABS_VOICE_ID", os.getenv("ELEVENLABS_VOICE_1_ID")
         )
-        elevenlabs_api_key = os.getenv("ELEVENLABS_API_KEY")
-        if os.getenv("USE_MAC_OS_TTS"):
-            default_tts_provider = "macos"
-        elif elevenlabs_api_key:
-            default_tts_provider = "elevenlabs"
-        elif os.getenv("USE_BRIAN_TTS"):
-            default_tts_provider = "streamelements"
-        else:
-            default_tts_provider = "gtts"
-        config_dict["text_to_speech_provider"] = default_tts_provider
+        if not config_dict["text_to_speech_provider"]:
+            if os.getenv("USE_MAC_OS_TTS"):
+                default_tts_provider = "macos"
+            elif config_dict["elevenlabs_api_key"]:
+                default_tts_provider = "elevenlabs"
+            elif os.getenv("USE_BRIAN_TTS"):
+                default_tts_provider = "streamelements"
+            else:
+                default_tts_provider = "gtts"
+            config_dict["text_to_speech_provider"] = default_tts_provider
 
         config_dict["plugins_allowlist"] = _safe_split(os.getenv("ALLOWLISTED_PLUGINS"))
         config_dict["plugins_denylist"] = _safe_split(os.getenv("DENYLISTED_PLUGINS"))
-- cgit v1.2.3


From 5fe95adc069c89d5d2376a6835ae13ed1e743465 Mon Sep 17 00:00:00 2001
From: Auto-GPT-Bot
Date: Wed, 19 Jul 2023 08:18:54 +0000
Subject: smol-developer-20230719081854

---
 reports/smol-developer/file5_07-19-08-18.json | 266 ++++++++++++++++++++++++++
 1 file changed, 266 insertions(+)
 create mode 100644 reports/smol-developer/file5_07-19-08-18.json

diff --git a/reports/smol-developer/file5_07-19-08-18.json b/reports/smol-developer/file5_07-19-08-18.json
new file mode 100644
index 000000000..8e37c7651
--- /dev/null
+++ b/reports/smol-developer/file5_07-19-08-18.json
@@ -0,0 +1,266 @@
+{
+  "command": "agbenchmark start",
+  "completion_time": "2023-07-19-08:18",
+  "metrics": {
+    "run_time": "40.84 seconds",
+    "highest_difficulty": "advanced: 5"
+  },
+  "tests": {
+    "TestWriteFile": {
+      "data_path": "agbenchmark/challenges/interface/write_file",
+      "is_regression": false,
+      "task": "Print the capital of America to a .txt file",
+      "answer": "Washington",
+      "description": "Tests writing to a file",
+      "metrics": {
+        "difficulty": "interface",
+        "success": true,
+        "success_%": 100.0,
+        "run_time": "7.054 seconds"
+      },
+      "reached_cutoff": false
+    },
+    "TestBasicCodeGeneration": {
+      "data_path": "agbenchmark/challenges/code/d3_two_sum",
+      "is_regression": false,
+      "task": "Create a two_sum function in a file 
called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": true, + "success_%": 100.0, + "run_time": "9.593 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "14.527 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "7.886 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "1.513 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.",
+      "metrics": {
+        "difficulty": "advanced",
+        "success": false,
+        "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.001 seconds"
+      },
+      "reached_cutoff": false
+    }
+  },
+  "config": {
+    "workspace": "generated"
+  }
+}
\ No newline at end of file
-- cgit v1.2.3


From 34309a6c00e1d1c42b99944138cf78fff26f13de Mon Sep 17 00:00:00 2001
From: Auto-GPT-Bot
Date: Wed, 19 Jul 2023 08:19:09 +0000
Subject: gpt-engineer-20230719081909

---
 reports/gpt-engineer/file5_07-19-08-18.json | 268 ++++++++++++++++++++++++++++
 1 file changed, 268 insertions(+)
 create mode 100644 reports/gpt-engineer/file5_07-19-08-18.json

diff --git a/reports/gpt-engineer/file5_07-19-08-18.json b/reports/gpt-engineer/file5_07-19-08-18.json
new file mode 100644
index 000000000..68fe4d03e
--- /dev/null
+++ b/reports/gpt-engineer/file5_07-19-08-18.json
@@ -0,0 +1,268 @@
+{
+  "command": "agbenchmark start",
+  "completion_time": "2023-07-19-08:19",
+  "metrics": {
+    "run_time": "49.52 seconds",
+    "highest_difficulty": "No successful tests"
+  },
+  "tests": {
+    "TestWriteFile": {
+      "data_path": "agbenchmark/challenges/interface/write_file",
+      "is_regression": false,
+      "task": "Print the capital of America to a .txt file",
+      "answer": "Washington",
+      "description": "Tests writing to a file",
+      "metrics": {
+        "difficulty": "interface",
+        "success": false,
+        "fail_reason": "assert 1 in []",
+        "success_%": 0.0,
+        "run_time": "49.338 seconds"
+      },
+      "reached_cutoff": false
+    },
+    "TestBasicCodeGeneration": {
+      "data_path": "agbenchmark/challenges/code/d3_two_sum",
+      "is_regression": false,
+      "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestThreeSum::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0], agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0], agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0], agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0], agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "projects/my-new-project/workspace" + } +} \ No newline at end of file -- cgit v1.2.3 From f475631cddd22d0d5591c7dea40ba64c0fd3576d Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Wed, 19 Jul 2023 08:22:04 +0000 Subject: Auto-GPT-20230719082204 --- reports/Auto-GPT/file5_07-19-08-18.json | 267 ++++++++++++++++++++++ 1 file changed, 267 insertions(+) create mode 100644 reports/Auto-GPT/file5_07-19-08-18.json diff --git a/reports/Auto-GPT/file5_07-19-08-18.json b/reports/Auto-GPT/file5_07-19-08-18.json new file mode 100644 index 000000000..25761cc08 --- /dev/null +++ b/reports/Auto-GPT/file5_07-19-08-18.json @@ -0,0 +1,267 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-19-08:21", + "metrics": { + "run_time": "219.63 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "42.055 seconds" + }, + "reached_cutoff": false + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice.
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "90.246 seconds" + }, + "reached_cutoff": true + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "26.804 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "60.304 seconds" + }, + "reached_cutoff": true + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestThreeSum::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "autogpt/workspace/auto_gpt_workspace" + } +} \ No newline at end of file -- cgit v1.2.3 From dcdc0c97274586d599006ae02e969354eb9882b4 Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Wed, 19 Jul 2023 13:37:29 -0700 Subject: Integrate Beebot (#169) --- .github/workflows/ci.yml | 31 +++++++++++++++++++------------ .gitmodules | 4 ++++ agent/beebot | 1 + 3 files changed, 24 insertions(+), 12 deletions(-) create mode 160000 agent/beebot diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e34b2e864..261800450 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -71,7 +71,6 @@ jobs: min-python-version: "3.10" name: "${{ matrix.agent-name }} (Cache: ${{ matrix.cache-enabled }})" runs-on: ubuntu-latest - timeout-minutes: 10 strategy: fail-fast: false matrix: @@ -80,6 +79,7 @@ jobs: - "smol-developer" - "Auto-GPT" - "mini-agi" + - "beebot" cache-enabled: [ true, false ] steps: @@ -115,6 +115,7 @@ jobs: - name: Run regression tests run: | cd agent/$AGENT_NAME + prefix="" if [ "$AGENT_NAME" == "gpt-engineer" ]; then make install source venv/bin/activate @@ -135,6 +136,10 @@ jobs: cp config_template.yaml config.yaml sed -i 's/OPENAI_API_KEY:.*/OPENAI_API_KEY: "'"${{ secrets.OPENAI_API_KEY }}"'"/' config.yaml docker-compose up -d --build + elif [ "$AGENT_NAME" == "beebot" ]; then + poetry install + poetry run playwright install + prefix="poetry run " else echo "Unknown agent name: $AGENT_NAME" exit 1 @@ -143,19 +148,19 @@ jobs: pip install ../../dist/*.whl if [ "${GITHUB_EVENT_NAME}" == "pull_request" ]; then - agbenchmark start --maintain --mock - 
agbenchmark start --improve --mock - agbenchmark start --mock - agbenchmark start --mock --category=retrieval - agbenchmark start --mock --category=interface - agbenchmark start --mock --category=code - agbenchmark start --mock --category=memory - agbenchmark start --mock --category=iterate + ${prefix}agbenchmark start --maintain --mock + ${prefix}agbenchmark start --improve --mock + ${prefix}agbenchmark start --mock + ${prefix}agbenchmark start --mock --category=retrieval + ${prefix}agbenchmark start --mock --category=interface + ${prefix}agbenchmark start --mock --category=code + ${prefix}agbenchmark start --mock --category=memory + ${prefix}agbenchmark start --mock --category=iterate else curl -s https://raw.githubusercontent.com/Helicone/helicone/main/mitmproxy.sh | bash -s start - agbenchmark start || echo "This command will always return a non zero exit code unless all the challenges are solved." + ${prefix}agbenchmark start || echo "This command will always return a non zero exit code unless all the challenges are solved." fi - + cd ../.. env: git fetch origin $current_branch git rebase origin/$current_branch git push origin HEAD - poetry run python send_to_googledrive.py + if [ "$current_branch" == "master" ]; then + poetry run python send_to_googledrive.py + fi env: GDRIVE_BASE64: ${{ secrets.GDRIVE_BASE64 }} diff --git a/.gitmodules b/.gitmodules index d2b71f9c4..9fefe0c06 100644 --- a/.gitmodules +++ b/.gitmodules @@ -18,3 +18,7 @@ path = agent/SuperAGI url = https://github.com/SilenNaihin/SuperAGI.git branch = benchmark-integration +[submodule "agent/beebot"] + path = agent/beebot + url = https://github.com/merwanehamadi/beebot.git + branch = benchmark-integration diff --git a/agent/beebot b/agent/beebot new file mode 160000 index 000000000..b9686b12d --- /dev/null +++ b/agent/beebot @@ -0,0 +1 @@ +Subproject commit b9686b12d317b26095d706665f0a43244d7afb7c -- cgit v1.2.3 From 2fcf5352b233618651bfdbad260e063cd662f14a Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Wed, 19 Jul 2023 20:40:10 +0000 Subject: smol-developer-20230719204010 --- reports/smol-developer/file6_07-19-20-39.json | 266 ++++++++++++++++++++++ 1 file changed, 266 insertions(+) create mode 100644 reports/smol-developer/file6_07-19-20-39.json diff --git a/reports/smol-developer/file6_07-19-20-39.json b/reports/smol-developer/file6_07-19-20-39.json new file mode 100644 index 000000000..1a56f98f9 --- /dev/null +++ b/reports/smol-developer/file6_07-19-20-39.json @@ -0,0 +1,266 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-19-20:40", + "metrics": { + "run_time": "38.68 seconds", + "highest_difficulty": "advanced: 5" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "5.156 seconds" + }, + "reached_cutoff": false + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice.
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": true, + "success_%": 100.0, + "run_time": "7.535 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "14.961 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "9.584 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "1.246 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": 
"agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. 
", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "generated" + } +} \ No newline at end of file -- cgit v1.2.3 From 147425837c54ce67453a9812acb5f8e615958489 Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Wed, 19 Jul 2023 20:40:50 +0000 Subject: Auto-GPT-20230719204050 --- reports/Auto-GPT/file6_07-19-20-40.json | 268 ++++++++++++++++++++++ 1 file changed, 268 insertions(+) create mode 100644 reports/Auto-GPT/file6_07-19-20-40.json diff --git a/reports/Auto-GPT/file6_07-19-20-40.json b/reports/Auto-GPT/file6_07-19-20-40.json new file mode 100644 index 000000000..715d2a276 --- /dev/null +++ b/reports/Auto-GPT/file6_07-19-20-40.json @@ -0,0 +1,268 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-19-20:40", + "metrics": { + "run_time": "23.24 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "22.992 seconds" + }, + "reached_cutoff": false + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice.
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestThreeSum::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0], venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0], venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0], venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestWriteFile::test_method[challenge_data0], venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "autogpt/workspace/auto_gpt_workspace" + } +} \ No newline at end of file -- cgit v1.2.3 From ef684baa441472cb4bca99d16319cac17ba9b8e2 Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Wed, 19 Jul 2023 20:41:24 +0000 Subject: beebot-20230719204124 --- reports/beebot/file1_07-19-20-40.json | 268 ++++++++++++++++++++++ 1 file changed, 268 insertions(+) create mode 100644 reports/beebot/file1_07-19-20-40.json diff --git a/reports/beebot/file1_07-19-20-40.json b/reports/beebot/file1_07-19-20-40.json new file mode 100644 index 000000000..1f728f85a --- /dev/null +++ b/reports/beebot/file1_07-19-20-40.json @@ -0,0 +1,268 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-19-20:41", + "metrics": { + "run_time": "46.24 seconds", + "highest_difficulty": "No successful tests" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "46.006 seconds" + }, + "reached_cutoff": false + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target.
You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0] depends on challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.003 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "challenges/test_all.py::TestSearch::test_method[challenge_data0] depends on challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "challenges/test_all.py::TestReadFile::test_method[challenge_data0] depends on challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "challenges/test_all.py::TestThreeSum::test_method[challenge_data0] depends on challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0], challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on challenges/test_all.py::TestSearch::test_method[challenge_data0], challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on challenges/test_all.py::TestReadFile::test_method[challenge_data0], challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on challenges/test_all.py::TestReadFile::test_method[challenge_data0], challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. 
In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "workspace" + } +} \ No newline at end of file -- cgit v1.2.3 From df5c66d8a7642fdc473d878fc4dcf11e353f31da Mon Sep 17 00:00:00 2001 From: merwanehamadi Date: Wed, 19 Jul 2023 14:53:42 -0700 Subject: Change beebot submodule (#170) --- .gitmodules | 2 +- agent/beebot | 2 +- reports/beebot/file1_07-19-20-40.json | 268 ---------------------------------- 3 files changed, 2 insertions(+), 270 deletions(-) delete mode 100644 reports/beebot/file1_07-19-20-40.json diff --git a/.gitmodules b/.gitmodules index 9fefe0c06..389314b1d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -21,4 +21,4 @@ [submodule "agent/beebot"] path = agent/beebot url = https://github.com/merwanehamadi/beebot.git - branch = benchmark-integration + branch = master diff --git a/agent/beebot b/agent/beebot index b9686b12d..2e9291d93 160000 --- a/agent/beebot +++ b/agent/beebot @@ -1 +1 @@ -Subproject commit b9686b12d317b26095d706665f0a43244d7afb7c +Subproject commit 2e9291d93080890884bf63ae19d3533a960b69a3 diff --git a/reports/beebot/file1_07-19-20-40.json b/reports/beebot/file1_07-19-20-40.json deleted file mode 100644 index 1f728f85a..000000000 --- a/reports/beebot/file1_07-19-20-40.json +++ /dev/null @@ -1,268 +0,0 @@ -{ - "command": "agbenchmark start", - "completion_time": "2023-07-19-20:41", - "metrics": { - "run_time": "46.24 seconds", - "highest_difficulty": "No successful tests" - }, - "tests": { - "TestWriteFile": { - "data_path": "agbenchmark/challenges/interface/write_file", - "is_regression": false, - "task": "Print the the capital of America to a .txt file", - "answer": "Washington", - "description": "Tests the writing to file", - "metrics": { - "difficulty": "interface", - "success": false, - "fail_reason": "assert 1 in [0.0]", - "success_%": 0.0, - "run_time": "46.006 seconds" - }, - "reached_cutoff": false - }, - "TestBasicCodeGeneration": { - "data_path": "agbenchmark/challenges/code/d3_two_sum", - "is_regression": false, - "task": 
"Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", - "answer": "The two_sum function coded properly.", - "description": "Tests ability for the agent to create the two_sum function.", - "metrics": { - "difficulty": "advanced", - "success": false, - "fail_reason": "challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0] depends on challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", - "success_%": 0.0, - "run_time": "0.003 seconds" - }, - "reached_cutoff": false - }, - "TestSearch": { - "data_path": "agbenchmark/challenges/interface/search", - "is_regression": false, - "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", - "answer": "This is a Heading\nThis is a paragraph.", - "description": "Tests if an llm can search", - "metrics": { - "difficulty": "interface", - "success": false, - "fail_reason": "challenges/test_all.py::TestSearch::test_method[challenge_data0] depends on challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", - "success_%": 0.0, - "run_time": "0.001 seconds" - }, - "reached_cutoff": false - }, - "TestReadFile": { - "data_path": "agbenchmark/challenges/interface/read_file", - "is_regression": false, - "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", - "answer": "random string Hello World!", - "description": "This reads the file quickly", - "metrics": { - "difficulty": "interface", - "success": false, - "fail_reason": "challenges/test_all.py::TestReadFile::test_method[challenge_data0] depends on challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", - "success_%": 0.0, - "run_time": "0.001 seconds" - }, - "reached_cutoff": false - }, - "TestThreeSum": { - "data_path": "agbenchmark/challenges/code/d5_three_sum", - "is_regression": false, - "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", - "answer": "The three_sum function coded properly.", - "description": "Tests ability for the agent to create the three_sum function.", - "metrics": { - "difficulty": "advanced", - "success": false, - "fail_reason": "challenges/test_all.py::TestThreeSum::test_method[challenge_data0] depends on challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0], challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", - "success_%": 0.0, - "run_time": "0.002 seconds" - }, - "reached_cutoff": false - }, - "TestBasicRetrieval": { - "data_path": "agbenchmark/challenges/retrieval/r1_book_price", - "is_regression": false, - "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", - "answer": "\u00a325.89", - "description": "Specifies specific website to retrieve website from.", - "metrics": { - "difficulty": "basic", - "success": false, - "fail_reason": "challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on challenges/test_all.py::TestSearch::test_method[challenge_data0], challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", - "success_%": 0.0, - "run_time": "0.002 seconds" - }, - "reached_cutoff": false - }, - "TestDebugSimpleTypoWithGuidance": { - "data_path": "agbenchmark/challenges/code/d1_debug", - "is_regression": false, - "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", - "answer": "[0, 1] [2, 5] [0, 3]", - "description": "Tests ability for the agent to debug python code with a simple typo in it.", - "metrics": { - "difficulty": "basic", - "success": false, - "fail_reason": "challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on challenges/test_all.py::TestReadFile::test_method[challenge_data0], challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", - "success_%": 0.0, - "run_time": "0.002 seconds" - }, - "reached_cutoff": false - }, - "TestBasicMemory": { - "data_path": "agbenchmark/challenges/memory/m1_id", - "is_regression": false, - "task": "Follow the instructions in the instructions_1.txt file", - "answer": "2314", - "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", - "metrics": { - "difficulty": "basic", - "success": false, - "fail_reason": "challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on challenges/test_all.py::TestReadFile::test_method[challenge_data0], challenges/test_all.py::TestWriteFile::test_method[challenge_data0]", - "success_%": 0.0, - "run_time": "0.002 seconds" - }, - "reached_cutoff": false - }, - "TestRetrieval2.0": { - "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", - "is_regression": false, - "task": "Write tesla's revenue in 2022 into a .txt file.", - "answer": "It was $81.462 billion in 2022.", - "description": "A no guardrails search for info", - "metrics": { - "difficulty": "novice", - "success": false, - "fail_reason": "challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", - "success_%": 0.0, - "run_time": "0.002 seconds" - }, - "reached_cutoff": false - }, - "TestDebugSimpleTypoWithoutGuidance": { - "data_path": "agbenchmark/challenges/code/d2_vague", - "is_regression": false, - "task": "Make test.py run without errors.", - "answer": "[0, 1] [2, 5] [0, 3]", - "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", - "metrics": { - "difficulty": "novice", - "success": false, - "fail_reason": "challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", - "success_%": 0.0, - "run_time": "0.002 seconds" - }, - "reached_cutoff": false - }, - "TestCreateSimpleWebServer": { - "data_path": "agbenchmark/challenges/code/d4_web_server", - "is_regression": false, - "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", - "answer": "GET localhost:8079/health responds with a 200 OK", - "description": "Tests ability for the agent to build a simple web server locally", - "metrics": { - "difficulty": "advanced", - "success": false, - "fail_reason": "challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", - "success_%": 0.0, - "run_time": "0.001 seconds" - }, - "reached_cutoff": false - }, - "TestRememberMultipleIds": { - "data_path": "agbenchmark/challenges/memory/m2_multiple", - "is_regression": false, - "task": "Follow the instructions in the instructions_1.txt file", - "answer": "3145\n3791\n9317\n9471", - "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", - "metrics": { - "difficulty": "novice", - "success": false, - "fail_reason": "challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", - "success_%": 0.0, - "run_time": "0.001 seconds" - }, - "reached_cutoff": false - }, - "TestRetrieval2.1": { - "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", - "is_regression": false, - "task": "Write Tesla's precise revenue in 2022 into a .txt file.", - "answer": "It was $81.462 billion in 2022.", - "description": "This one checks the accuracy of the information over r2", - "metrics": { - "difficulty": "novice", - "success": false, - "fail_reason": "challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", - "success_%": 0.0, - "run_time": "0.001 seconds" - }, - "reached_cutoff": false - }, - "TestRememberMultipleWithNoise": { - "data_path": "agbenchmark/challenges/memory/m3_noise", - "is_regression": false, - "task": "Follow the instructions in the instructions_1.txt file", - "answer": "3145\n3791\n9317\n9471", - "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", - "metrics": { - "difficulty": "intermediate", - "success": false, - "fail_reason": "challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", - "success_%": 0.0, - "run_time": "0.002 seconds" - }, - "reached_cutoff": false - }, - "TestRetrieval3": { - "data_path": "agbenchmark/challenges/retrieval/r3", - "is_regression": false, - "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", - "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", - "description": "Tests ability to retrieve information.", - "metrics": { - "difficulty": "intermediate", - "success": false, - "fail_reason": "challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", - "success_%": 0.0, - "run_time": "0.002 seconds" - }, - "reached_cutoff": false - }, - "TestRetrieval2.2": { - "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", - "is_regression": false, - "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", - "answer": "It was $81.462 billion in 2022. 
In millions the answer is 81,462.", - "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", - "metrics": { - "difficulty": "intermediate", - "success": false, - "fail_reason": "challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", - "success_%": 0.0, - "run_time": "0.002 seconds" - }, - "reached_cutoff": false - }, - "TestRememberMultiplePhrasesWithNoise": { - "data_path": "agbenchmark/challenges/memory/m4_phrases", - "is_regression": false, - "task": "Follow the instructions in the instructions_1.txt file", - "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", - "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", - "metrics": { - "difficulty": "advanced", - "success": false, - "fail_reason": "challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", - "success_%": 0.0, - "run_time": "0.001 seconds" - }, - "reached_cutoff": false - } - }, - "config": { - "workspace": "workspace" - } -} \ No newline at end of file -- cgit v1.2.3 From aec0e2fe7af62a0bac2921e108d2ec3ee2e8b8dd Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Wed, 19 Jul 2023 21:56:12 +0000 Subject: smol-developer-20230719215612 --- reports/smol-developer/file7_07-19-21-55.json | 266 ++++++++++++++++++++++++++ 1 file changed, 266 insertions(+) create mode 100644 reports/smol-developer/file7_07-19-21-55.json diff --git a/reports/smol-developer/file7_07-19-21-55.json b/reports/smol-developer/file7_07-19-21-55.json new file mode 100644 index 000000000..0ed5b94cd --- /dev/null +++ b/reports/smol-developer/file7_07-19-21-55.json @@ -0,0 +1,266 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-19-21:56", + "metrics": { + "run_time": "35.04 seconds", + "highest_difficulty": "advanced: 5" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "4.839 seconds" + }, + "reached_cutoff": false + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": true, + "success_%": 100.0, + "run_time": "7.157 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "15.171 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "6.181 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "1.503 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": 
"agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. 
", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "generated" + } +} \ No newline at end of file -- cgit v1.2.3 From d14ccd71cdc1eada0675766a8982e430b7caf9a7 Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Wed, 19 Jul 2023 21:59:39 +0000 Subject: Auto-GPT-20230719215939 --- reports/Auto-GPT/file7_07-19-21-56.json | 267 ++++++++++++++++++++++++++++++++ 1 file changed, 267 insertions(+) create mode 100644 reports/Auto-GPT/file7_07-19-21-56.json diff --git a/reports/Auto-GPT/file7_07-19-21-56.json b/reports/Auto-GPT/file7_07-19-21-56.json new file mode 100644 index 000000000..636cb642f --- /dev/null +++ b/reports/Auto-GPT/file7_07-19-21-56.json @@ -0,0 +1,267 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-19-21:59", + "metrics": { + "run_time": "169.14 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "49.739 seconds" + }, + "reached_cutoff": false + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "16.504 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "26.102 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "76.482 seconds" + }, + "reached_cutoff": true + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestThreeSum::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "autogpt/workspace/auto_gpt_workspace" + } +} \ No newline at end of file -- cgit v1.2.3 From 71ff6f1b8ce2a860f2b97f85726b69aac365170a Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Wed, 19 Jul 2023 22:02:29 +0000 Subject: gpt-engineer-20230719220229 --- reports/gpt-engineer/file6_07-19-21-55.json | 264 ++++++++++++++++++++++++++++ 1 file changed, 264 insertions(+) create mode 100644 reports/gpt-engineer/file6_07-19-21-55.json diff --git a/reports/gpt-engineer/file6_07-19-21-55.json b/reports/gpt-engineer/file6_07-19-21-55.json new file mode 100644 index 000000000..aa91aeaad --- /dev/null +++ b/reports/gpt-engineer/file6_07-19-21-55.json @@ -0,0 +1,264 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-19-22:02", + "metrics": { + "run_time": "403.03 seconds", + "highest_difficulty": "advanced: 5" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "64.853 seconds" + }, + "reached_cutoff": true + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. 
You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": true, + "success_%": 100.0, + "run_time": "70.097 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "74.87 seconds" + }, + "reached_cutoff": true + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: '/home/runner/work/Auto-GPT-Benchmarks/Auto-GPT-Benchmarks/agent/gpt-engineer/projects/my-new-project/workspace/file_to_check.txt'", + "success_%": 0.0, + "run_time": "65.049 seconds" + }, + "reached_cutoff": true + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": true, + "success_%": 100.0, + "run_time": "85.607 seconds" + }, + "reached_cutoff": true + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "assert 1 in []", + "success_%": 0.0, + "run_time": "42.365 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "projects/my-new-project/workspace" + } +} \ No newline at end of file -- cgit v1.2.3 From 204af6597db9eb47b3252b252470559853939cbe Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Wed, 19 Jul 2023 22:04:21 +0000 Subject: beebot-20230719220421 --- reports/beebot/file1_07-19-21-56.json | 263 ++++++++++++++++++++++++++++++++++ 1 file changed, 263 insertions(+) create mode 100644 reports/beebot/file1_07-19-21-56.json diff --git a/reports/beebot/file1_07-19-21-56.json b/reports/beebot/file1_07-19-21-56.json new file mode 100644 index 000000000..78ea838a8 --- /dev/null +++ b/reports/beebot/file1_07-19-21-56.json @@ -0,0 +1,263 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-19-22:04", + "metrics": { + "run_time": "494.94 seconds", + "highest_difficulty": "advanced: 5" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "15.637 seconds" + }, + "reached_cutoff": false + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": true, + "success_%": 100.0, + "run_time": "94.174 seconds" + }, + "reached_cutoff": true + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in []", + "success_%": 0.0, + "run_time": "20.195 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "25.044 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "64.425 seconds" + }, + "reached_cutoff": true + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "75.682 seconds" + }, + "reached_cutoff": true + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the 
instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": true, + "success_%": 100.0, + "run_time": "60.342 seconds" + }, + "reached_cutoff": true + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.007 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": true, + "success_%": 100.0, + "run_time": "61.089 seconds" + }, + "reached_cutoff": true + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "[Errno 2] No such file or directory: '/home/runner/work/Auto-GPT-Benchmarks/Auto-GPT-Benchmarks/agent/beebot/workspace/result.txt'", + "success_%": 0.0, + "run_time": "78.158 seconds" + }, + "reached_cutoff": true + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. 
In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "workspace" + } +} \ No newline at end of file -- cgit v1.2.3 From 622e0a2d62e0cdee24bad71e844f3891cde60331 Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Thu, 20 Jul 2023 08:19:09 +0000 Subject: smol-developer-20230720081909 --- reports/smol-developer/file8_07-20-08-18.json | 266 ++++++++++++++++++++++++++ 1 file changed, 266 insertions(+) create mode 100644 reports/smol-developer/file8_07-20-08-18.json diff --git a/reports/smol-developer/file8_07-20-08-18.json b/reports/smol-developer/file8_07-20-08-18.json new file mode 100644 index 000000000..01e7b79a1 --- /dev/null +++ b/reports/smol-developer/file8_07-20-08-18.json @@ -0,0 +1,266 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-20-08:19", + "metrics": { + "run_time": "48.44 seconds", + "highest_difficulty": "advanced: 5" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "8.826 seconds" + }, + "reached_cutoff": false + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": true, + "success_%": 100.0, + "run_time": "9.169 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "18.189 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "10.634 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "assert 1 in [0.0]", + "success_%": 0.0, + "run_time": "1.403 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": 
"agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. 
", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.002 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/smol-developer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "generated" + } +} \ No newline at end of file -- cgit v1.2.3 From 5f795e203a893d0707812a6ec04dacd314a706eb Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Thu, 20 Jul 2023 08:20:13 +0000 Subject: gpt-engineer-20230720082013 --- reports/gpt-engineer/file7_07-20-08-18.json | 267 ++++++++++++++++++++++++++++ 1 file changed, 267 insertions(+) create mode 100644 reports/gpt-engineer/file7_07-20-08-18.json diff --git a/reports/gpt-engineer/file7_07-20-08-18.json b/reports/gpt-engineer/file7_07-20-08-18.json new file mode 100644 index 000000000..ae1bcaf81 --- /dev/null +++ b/reports/gpt-engineer/file7_07-20-08-18.json @@ -0,0 +1,267 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-20-08:20", + "metrics": { + "run_time": "123.99 seconds", + "highest_difficulty": "interface: 1" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "65.136 seconds" + }, + "reached_cutoff": true + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].", + "answer": "The two_sum function coded properly.", + "description": "Tests ability for the agent to create the two_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "[Errno 20] Not a directory: '/home/runner/work/Auto-GPT-Benchmarks/Auto-GPT-Benchmarks/agent/gpt-engineer/projects/my-new-project/workspace'", + "success_%": 0.0, + "run_time": "57.021 seconds" + }, + "reached_cutoff": false + }, + "TestSearch": { + "data_path": "agbenchmark/challenges/interface/search", + "is_regression": false, + "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file", + "answer": "This is a Heading\nThis is a paragraph.", + "description": "Tests if an llm can search", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "assert 1 in []", + "success_%": 0.0, + "run_time": "0.756 seconds" + }, + "reached_cutoff": false + }, + "TestReadFile": { + "data_path": "agbenchmark/challenges/interface/read_file", + "is_regression": false, + "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt", + "answer": "random string Hello World!", + "description": "This reads the file quickly", + "metrics": { + "difficulty": "interface", + "success": false, + "fail_reason": "[Errno 20] Not a directory: '/home/runner/work/Auto-GPT-Benchmarks/Auto-GPT-Benchmarks/agent/gpt-engineer/projects/my-new-project/workspace/file_to_check.txt'", + "success_%": 0.0, + "run_time": "0.774 seconds" + }, + "reached_cutoff": false + }, + "TestThreeSum": { + "data_path": "agbenchmark/challenges/code/d5_three_sum", + "is_regression": false, + "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].", + "answer": "The three_sum function coded properly.", + "description": "Tests ability for the agent to create the three_sum function.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestThreeSum::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicCodeGeneration::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicRetrieval": { + "data_path": "agbenchmark/challenges/retrieval/r1_book_price", + "is_regression": false, + "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.", + "answer": "\u00a325.89", + "description": "Specifies specific website to retrieve website from.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestSearch::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithGuidance": { + "data_path": "agbenchmark/challenges/code/d1_debug", + "is_regression": false, + "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it.", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestBasicMemory": { + "data_path": "agbenchmark/challenges/memory/m1_id", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "2314", + "description": "Tests ability for the agent to remember information between each action. 
An id is presented initially and the agent has to remember it after reading 4 other files", + "metrics": { + "difficulty": "basic", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestReadFile::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.0": { + "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue", + "is_regression": false, + "task": "Write tesla's revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "A no guardrails search for info", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestDebugSimpleTypoWithoutGuidance": { + "data_path": "agbenchmark/challenges/code/d2_vague", + "is_regression": false, + "task": "Make test.py run without errors.", + "answer": "[0, 1] [2, 5] [0, 3]", + "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestCreateSimpleWebServer": { + "data_path": "agbenchmark/challenges/code/d4_web_server", + "is_regression": false, + "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ", + "answer": "GET localhost:8079/health responds with a 200 OK", + "description": "Tests ability for the agent to build a simple web server locally", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleIds": { + "data_path": "agbenchmark/challenges/memory/m2_multiple", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. 
Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestBasicMemory::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.1": { + "data_path": "agbenchmark/challenges/retrieval/r2.1_specific", + "is_regression": false, + "task": "Write Tesla's precise revenue in 2022 into a .txt file.", + "answer": "It was $81.462 billion in 2022.", + "description": "This one checks the accuracy of the information over r2", + "metrics": { + "difficulty": "novice", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultipleWithNoise": { + "data_path": "agbenchmark/challenges/memory/m3_noise", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "3145\n3791\n9317\n9471", + "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval3": { + "data_path": "agbenchmark/challenges/retrieval/r3", + "is_regression": false, + "task": "Write tesla's revenue every year since its creation into a .txt file. 
Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions", + "description": "Tests ability to retrieve information.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRetrieval2.2": { + "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting", + "is_regression": false, + "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 billion).", + "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.", + "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.", + "metrics": { + "difficulty": "intermediate", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + }, + "TestRememberMultiplePhrasesWithNoise": { + "data_path": "agbenchmark/challenges/memory/m4_phrases", + "is_regression": false, + "task": "Follow the instructions in the instructions_1.txt file", + "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall", + "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. 
Some noise is also added to the files to test the agent's ability to filter out noise.", + "metrics": { + "difficulty": "advanced", + "success": false, + "fail_reason": "agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on agent/gpt-engineer/venv/lib/python3.10/site-packages/agbenchmark/challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]", + "success_%": 0.0, + "run_time": "0.001 seconds" + }, + "reached_cutoff": false + } + }, + "config": { + "workspace": "projects/my-new-project/workspace" + } +} \ No newline at end of file -- cgit v1.2.3 From ad05c9886d527805b61f58e61333128406915122 Mon Sep 17 00:00:00 2001 From: Auto-GPT-Bot Date: Thu, 20 Jul 2023 08:26:52 +0000 Subject: beebot-20230720082652 --- reports/beebot/file2_07-20-08-18.json | 264 ++++++++++++++++++++++++++++++++++ 1 file changed, 264 insertions(+) create mode 100644 reports/beebot/file2_07-20-08-18.json diff --git a/reports/beebot/file2_07-20-08-18.json b/reports/beebot/file2_07-20-08-18.json new file mode 100644 index 000000000..4d423a445 --- /dev/null +++ b/reports/beebot/file2_07-20-08-18.json @@ -0,0 +1,264 @@ +{ + "command": "agbenchmark start", + "completion_time": "2023-07-20-08:26", + "metrics": { + "run_time": "480.5 seconds", + "highest_difficulty": "advanced: 5" + }, + "tests": { + "TestWriteFile": { + "data_path": "agbenchmark/challenges/interface/write_file", + "is_regression": false, + "task": "Print the the capital of America to a .txt file", + "answer": "Washington", + "description": "Tests the writing to file", + "metrics": { + "difficulty": "interface", + "success": true, + "success_%": 100.0, + "run_time": "22.969 seconds" + }, + "reached_cutoff": false + }, + "TestBasicCodeGeneration": { + "data_path": "agbenchmark/challenges/code/d3_two_sum", + "is_regression": false, + "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. 
From ad05c9886d527805b61f58e61333128406915122 Mon Sep 17 00:00:00 2001
From: Auto-GPT-Bot
Date: Thu, 20 Jul 2023 08:26:52 +0000
Subject: beebot-20230720082652

---
 reports/beebot/file2_07-20-08-18.json | 264 ++++++++++++++++++++++++++++++++++
 1 file changed, 264 insertions(+)
 create mode 100644 reports/beebot/file2_07-20-08-18.json

diff --git a/reports/beebot/file2_07-20-08-18.json b/reports/beebot/file2_07-20-08-18.json
new file mode 100644
index 000000000..4d423a445
--- /dev/null
+++ b/reports/beebot/file2_07-20-08-18.json
@@ -0,0 +1,264 @@
+{
+  "command": "agbenchmark start",
+  "completion_time": "2023-07-20-08:26",
+  "metrics": {
+    "run_time": "480.5 seconds",
+    "highest_difficulty": "advanced: 5"
+  },
+  "tests": {
+    "TestWriteFile": {
+      "data_path": "agbenchmark/challenges/interface/write_file",
+      "is_regression": false,
+      "task": "Print the capital of America to a .txt file",
+      "answer": "Washington",
+      "description": "Tests the writing to file",
+      "metrics": {
+        "difficulty": "interface",
+        "success": true,
+        "success_%": 100.0,
+        "run_time": "22.969 seconds"
+      },
+      "reached_cutoff": false
+    },
+    "TestBasicCodeGeneration": {
+      "data_path": "agbenchmark/challenges/code/d3_two_sum",
+      "is_regression": false,
+      "task": "Create a two_sum function in a file called code.py. Given an array of integers, return indices of the two numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 9, Because nums[0] + nums[1] = 2 + 7 = 9, return [0, 1].",
+      "answer": "The two_sum function coded properly.",
+      "description": "Tests ability for the agent to create the two_sum function.",
+      "metrics": {
+        "difficulty": "advanced",
+        "success": true,
+        "success_%": 100.0,
+        "run_time": "90.6 seconds"
+      },
+      "reached_cutoff": true
+    },
+    "TestSearch": {
+      "data_path": "agbenchmark/challenges/interface/search",
+      "is_regression": false,
+      "task": "Open 'https://silennaihin.com/random/plain.html' and paste the text on the page in a .txt file",
+      "answer": "This is a Heading\nThis is a paragraph.",
+      "description": "Tests if an llm can search",
+      "metrics": {
+        "difficulty": "interface",
+        "success": false,
+        "fail_reason": "assert 1 in []",
+        "success_%": 0.0,
+        "run_time": "62.713 seconds"
+      },
+      "reached_cutoff": true
+    },
+    "TestReadFile": {
+      "data_path": "agbenchmark/challenges/interface/read_file",
+      "is_regression": false,
+      "task": "Write the string 'random string' before any existing text to the file called file_to_check.txt",
+      "answer": "random string Hello World!",
+      "description": "This reads the file quickly",
+      "metrics": {
+        "difficulty": "interface",
+        "success": true,
+        "success_%": 100.0,
+        "run_time": "60.053 seconds"
+      },
+      "reached_cutoff": true
+    },
+    "TestThreeSum": {
+      "data_path": "agbenchmark/challenges/code/d5_three_sum",
+      "is_regression": false,
+      "task": "Create a three_sum function in a file called code.py. Given an array of integers, return indices of the three numbers such that they add up to a specific target. You may assume that each input would have exactly one solution, and you may not use the same element twice. Example: Given nums = [2, 7, 11, 15], target = 20, Because nums[0] + nums[1] + nums[2] = 2 + 7 + 11 = 20, return [0, 1, 2].",
+      "answer": "The three_sum function coded properly.",
+      "description": "Tests ability for the agent to create the three_sum function.",
+      "metrics": {
+        "difficulty": "advanced",
+        "success": false,
+        "fail_reason": "assert 1 in [0.0]",
+        "success_%": 0.0,
+        "run_time": "71.451 seconds"
+      },
+      "reached_cutoff": true
+    },
+    "TestBasicRetrieval": {
+      "data_path": "agbenchmark/challenges/retrieval/r1_book_price",
+      "is_regression": false,
+      "task": "Write the price of the book in this url 'http://books.toscrape.com/catalogue/meditations_33/index.html' to a .txt file.",
+      "answer": "\u00a325.89",
+      "description": "Specifies specific website to retrieve website from.",
+      "metrics": {
+        "difficulty": "basic",
+        "success": false,
+        "fail_reason": "challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0] depends on challenges/test_all.py::TestSearch::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.001 seconds"
+      },
+      "reached_cutoff": false
+    },
+    "TestDebugSimpleTypoWithGuidance": {
+      "data_path": "agbenchmark/challenges/code/d1_debug",
+      "is_regression": false,
+      "task": "1- Run test.py.\n2- Read code.py.\n3- Modify code.py.\nRepeat step 1, 2 and 3 until test.py runs without errors.\n",
+      "answer": "[0, 1] [2, 5] [0, 3]",
+      "description": "Tests ability for the agent to debug python code with a simple typo in it.",
+      "metrics": {
+        "difficulty": "basic",
+        "success": false,
+        "fail_reason": "assert 1 in [0.0]",
+        "success_%": 0.0,
+        "run_time": "79.582 seconds"
+      },
+      "reached_cutoff": true
+    },
+    "TestBasicMemory": {
+      "data_path": "agbenchmark/challenges/memory/m1_id",
+      "is_regression": false,
+      "task": "Follow the instructions in the instructions_1.txt file",
+      "answer": "2314",
+      "description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files",
+      "metrics": {
+        "difficulty": "basic",
+        "success": true,
+        "success_%": 100.0,
+        "run_time": "60.606 seconds"
+      },
+      "reached_cutoff": true
+    },
+    "TestRetrieval2.0": {
+      "data_path": "agbenchmark/challenges/retrieval/r2_tesla_revenue",
+      "is_regression": false,
+      "task": "Write tesla's revenue in 2022 into a .txt file.",
+      "answer": "It was $81.462 billion in 2022.",
+      "description": "A no guardrails search for info",
+      "metrics": {
+        "difficulty": "novice",
+        "success": false,
+        "fail_reason": "challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0] depends on challenges/test_all.py::TestBasicRetrieval::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.001 seconds"
+      },
+      "reached_cutoff": false
+    },
+    "TestDebugSimpleTypoWithoutGuidance": {
+      "data_path": "agbenchmark/challenges/code/d2_vague",
+      "is_regression": false,
+      "task": "Make test.py run without errors.",
+      "answer": "[0, 1] [2, 5] [0, 3]",
+      "description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance",
+      "metrics": {
+        "difficulty": "novice",
+        "success": false,
+        "fail_reason": "challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.007 seconds"
+      },
+      "reached_cutoff": false
+    },
+    "TestCreateSimpleWebServer": {
+      "data_path": "agbenchmark/challenges/code/d4_web_server",
+      "is_regression": false,
+      "task": "Build a basic web server that responds to a GET localhost:8079/health with a 200 OK. Deploy this web server locally at the port 8079. ",
+      "answer": "GET localhost:8079/health responds with a 200 OK",
+      "description": "Tests ability for the agent to build a simple web server locally",
+      "metrics": {
+        "difficulty": "advanced",
+        "success": false,
+        "fail_reason": "challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.001 seconds"
+      },
+      "reached_cutoff": false
+    },
+    "TestRememberMultipleIds": {
+      "data_path": "agbenchmark/challenges/memory/m2_multiple",
+      "is_regression": false,
+      "task": "Follow the instructions in the instructions_1.txt file",
+      "answer": "3145\n3791\n9317\n9471",
+      "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.",
+      "metrics": {
+        "difficulty": "novice",
+        "success": false,
+        "fail_reason": "[Errno 2] No such file or directory: '/home/runner/work/Auto-GPT-Benchmarks/Auto-GPT-Benchmarks/agent/beebot/workspace/result.txt'",
+        "success_%": 0.0,
+        "run_time": "32.306 seconds"
+      },
+      "reached_cutoff": false
+    },
+    "TestRetrieval2.1": {
+      "data_path": "agbenchmark/challenges/retrieval/r2.1_specific",
+      "is_regression": false,
+      "task": "Write Tesla's precise revenue in 2022 into a .txt file.",
+      "answer": "It was $81.462 billion in 2022.",
+      "description": "This one checks the accuracy of the information over r2",
+      "metrics": {
+        "difficulty": "novice",
+        "success": false,
+        "fail_reason": "challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0] depends on challenges/test_all.py::TestRetrieval2.0::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.001 seconds"
+      },
+      "reached_cutoff": false
+    },
+    "TestRememberMultipleWithNoise": {
+      "data_path": "agbenchmark/challenges/memory/m3_noise",
+      "is_regression": false,
+      "task": "Follow the instructions in the instructions_1.txt file",
+      "answer": "3145\n3791\n9317\n9471",
+      "description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
+      "metrics": {
+        "difficulty": "intermediate",
+        "success": false,
+        "fail_reason": "challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0] depends on challenges/test_all.py::TestRememberMultipleIds::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.001 seconds"
+      },
+      "reached_cutoff": false
+    },
+    "TestRetrieval3": {
+      "data_path": "agbenchmark/challenges/retrieval/r3",
+      "is_regression": false,
+      "task": "Write tesla's revenue every year since its creation into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 million).",
+      "answer": "15 Millions\n112 Millions\n117 Millions\n204 Millions\n413 Millions\n2,014 Millions\n3,198 Millions\n4,046 Millions\n7,000 Millions\n11,759 Millions\n21,461 Millions\n24,578 Millions\n31,536 Millions\n53,823 Millions\n81,462 Millions",
+      "description": "Tests ability to retrieve information.",
+      "metrics": {
+        "difficulty": "intermediate",
+        "success": false,
+        "fail_reason": "challenges/test_all.py::TestRetrieval3::test_method[challenge_data0] depends on challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.001 seconds"
+      },
+      "reached_cutoff": false
+    },
+    "TestRetrieval2.2": {
+      "data_path": "agbenchmark/challenges/retrieval/r2.2_formatting",
+      "is_regression": false,
+      "task": "Write tesla's exact revenue in 2022 into a .txt file. Use the US notation, with a precision rounded to the nearest million dollars (for instance, $31,578 million).",
+      "answer": "It was $81.462 billion in 2022. In millions the answer is 81,462.",
+      "description": "Advanced version of the r2.1 challenge that also asks for specific formatting.",
+      "metrics": {
+        "difficulty": "intermediate",
+        "success": false,
+        "fail_reason": "challenges/test_all.py::TestRetrieval2.2::test_method[challenge_data0] depends on challenges/test_all.py::TestRetrieval2.1::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.001 seconds"
+      },
+      "reached_cutoff": false
+    },
+    "TestRememberMultiplePhrasesWithNoise": {
+      "data_path": "agbenchmark/challenges/memory/m4_phrases",
+      "is_regression": false,
+      "task": "Follow the instructions in the instructions_1.txt file",
+      "answer": "The purple elephant danced on a rainbow while eating a taco\nThe sneaky toaster stole my socks and ran away to Hawaii\nMy pet rock sings better than Beyonc\u00e9 on Tuesdays\nThe giant hamster rode a unicycle through the crowded mall",
+      "description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
+      "metrics": {
+        "difficulty": "advanced",
+        "success": false,
+        "fail_reason": "challenges/test_all.py::TestRememberMultiplePhrasesWithNoise::test_method[challenge_data0] depends on challenges/test_all.py::TestRememberMultipleWithNoise::test_method[challenge_data0]",
+        "success_%": 0.0,
+        "run_time": "0.001 seconds"
+      },
+      "reached_cutoff": false
+    }
+  },
+  "config": {
+    "workspace": "workspace"
+  }
+}
\ No newline at end of file
-- cgit v1.2.3
{task['objective']}\n" - parsed_response += f"Task type: {task['type']} " - parsed_response += f"Priority: {task['priority']}\n" - parsed_response += f"Ready Criteria:\n" - for j, criteria in enumerate(task["ready_criteria"]): - parsed_response += f" {j+1}. {criteria}\n" - parsed_response += f"Acceptance Criteria:\n" - for j, criteria in enumerate(task["acceptance_criteria"]): - parsed_response += f" {j+1}. {criteria}\n" - parsed_response += "\n" - - return parsed_response - - -def parse_next_ability(current_task, next_ability: dict) -> str: - parsed_response = f"Current Task: {current_task.objective}\n" - ability_args = ", ".join( - f"{k}={v}" for k, v in next_ability["ability_arguments"].items() - ) - parsed_response += f"Next Ability: {next_ability['next_ability']}({ability_args})\n" - parsed_response += f"Motivation: {next_ability['motivation']}\n" - parsed_response += f"Self-criticism: {next_ability['self_criticism']}\n" - parsed_response += f"Reasoning: {next_ability['reasoning']}\n" - return parsed_response - - -def parse_ability_result(ability_result) -> str: - parsed_response = f"Ability: {ability_result['ability_name']}\n" - parsed_response += f"Ability Arguments: {ability_result['ability_args']}\n" - parsed_response = f"Ability Result: {ability_result['success']}\n" - parsed_response += f"Message: {ability_result['message']}\n" - parsed_response += f"Data: {ability_result['new_knowledge']}\n" - return parsed_response diff --git a/autogpt/core/runner/client_lib/parser.py b/autogpt/core/runner/client_lib/parser.py new file mode 100755 index 000000000..9246ea82d --- /dev/null +++ b/autogpt/core/runner/client_lib/parser.py @@ -0,0 +1,45 @@ +def parse_agent_name_and_goals(name_and_goals: dict) -> str: + parsed_response = f"Agent Name: {name_and_goals['agent_name']}\n" + parsed_response += f"Agent Role: {name_and_goals['agent_role']}\n" + parsed_response += "Agent Goals:\n" + for i, goal in enumerate(name_and_goals["agent_goals"]): + parsed_response += f"{i+1}. {goal}\n" + return parsed_response + + +def parse_agent_plan(plan: dict) -> str: + parsed_response = f"Agent Plan:\n" + for i, task in enumerate(plan["task_list"]): + parsed_response += f"{i+1}. {task['objective']}\n" + parsed_response += f"Task type: {task['type']} " + parsed_response += f"Priority: {task['priority']}\n" + parsed_response += f"Ready Criteria:\n" + for j, criteria in enumerate(task["ready_criteria"]): + parsed_response += f" {j+1}. {criteria}\n" + parsed_response += f"Acceptance Criteria:\n" + for j, criteria in enumerate(task["acceptance_criteria"]): + parsed_response += f" {j+1}. 
{criteria}\n" + parsed_response += "\n" + + return parsed_response + + +def parse_next_ability(current_task, next_ability: dict) -> str: + parsed_response = f"Current Task: {current_task.objective}\n" + ability_args = ", ".join( + f"{k}={v}" for k, v in next_ability["ability_arguments"].items() + ) + parsed_response += f"Next Ability: {next_ability['next_ability']}({ability_args})\n" + parsed_response += f"Motivation: {next_ability['motivation']}\n" + parsed_response += f"Self-criticism: {next_ability['self_criticism']}\n" + parsed_response += f"Reasoning: {next_ability['reasoning']}\n" + return parsed_response + + +def parse_ability_result(ability_result) -> str: + parsed_response = f"Ability: {ability_result['ability_name']}\n" + parsed_response += f"Ability Arguments: {ability_result['ability_args']}\n" + parsed_response = f"Ability Result: {ability_result['success']}\n" + parsed_response += f"Message: {ability_result['message']}\n" + parsed_response += f"Data: {ability_result['new_knowledge']}\n" + return parsed_response -- cgit v1.2.3 From db95d4cb842ea1c7e7eea5d93e525c5b25127a5c Mon Sep 17 00:00:00 2001 From: Reinier van der Leer Date: Thu, 20 Jul 2023 17:34:49 +0200 Subject: Agent loop v2: Planning & Task Management (part 1: refactoring) (#4799) * Move rename module `agent` -> `agents` * WIP: abstract agent structure into base class and port Agent * Move command arg path sanitization to decorator * Add fallback token limit in llm.utils.create_chat_completion * Rebase `MessageHistory` class on `ChatSequence` class * Fix linting * Consolidate logging modules * Wham Bam Boom * Fix tests & linting complaints * Update Agent class docstring * Fix Agent import in autogpt.llm.providers.openai * Fix agent kwarg in test_execute_code.py * Fix benchmarks.py * Clean up lingering Agent(ai_name=...) initializations * Fix agent kwarg * Make sanitize_path_arg decorator more robust * Fix linting * Fix command enabling lambda's * Use relative paths in file ops logger * Fix test_execute_python_file_not_found * Fix Config model validation breaking on .plugins * Define validator for Config.plugins * Fix Config model issues * Fix agent iteration budget in testing * Fix declaration of context_while_think * Fix Agent.parse_and_process_response signature * Fix Agent cycle_budget usages * Fix budget checking in BaseAgent.__next__ * Fix cycle budget initialization * Fix function calling in BaseAgent.think() * Include functions in token length calculation * Fix Config errors * Add debug thing to patched_api_requestor to investigate HTTP 400 errors * If this works I'm gonna be sad * Fix BaseAgent cycle budget logic and document attributes * Document attributes on `Agent` * Fix import issues between Agent and MessageHistory * Improve typing * Extract application code from the agent (#4982) * Extract application code from the agent * Wrap interaction loop in a function and call in benchmarks * Forgot the important function call * Add docstrings and inline comments to run loop * Update typing and docstrings in agent * Docstring formatting * Separate prompt construction from on_before_think * Use `self.default_cycle_instruction` in `Agent.think()` * Fix formatting * hot fix the SIGINT handler (#4997) The signal handler in the autogpt/main.py doesn't work properly because of the clean_input(...) func. This commit remedies this issue. 
From db95d4cb842ea1c7e7eea5d93e525c5b25127a5c Mon Sep 17 00:00:00 2001
From: Reinier van der Leer
Date: Thu, 20 Jul 2023 17:34:49 +0200
Subject: Agent loop v2: Planning & Task Management (part 1: refactoring)
 (#4799)

* Move rename module `agent` -> `agents`
* WIP: abstract agent structure into base class and port Agent
* Move command arg path sanitization to decorator
* Add fallback token limit in llm.utils.create_chat_completion
* Rebase `MessageHistory` class on `ChatSequence` class
* Fix linting
* Consolidate logging modules
* Wham Bam Boom
* Fix tests & linting complaints
* Update Agent class docstring
* Fix Agent import in autogpt.llm.providers.openai
* Fix agent kwarg in test_execute_code.py
* Fix benchmarks.py
* Clean up lingering Agent(ai_name=...) initializations
* Fix agent kwarg
* Make sanitize_path_arg decorator more robust
* Fix linting
* Fix command enabling lambda's
* Use relative paths in file ops logger
* Fix test_execute_python_file_not_found
* Fix Config model validation breaking on .plugins
* Define validator for Config.plugins
* Fix Config model issues
* Fix agent iteration budget in testing
* Fix declaration of context_while_think
* Fix Agent.parse_and_process_response signature
* Fix Agent cycle_budget usages
* Fix budget checking in BaseAgent.__next__
* Fix cycle budget initialization
* Fix function calling in BaseAgent.think()
* Include functions in token length calculation
* Fix Config errors
* Add debug thing to patched_api_requestor to investigate HTTP 400 errors
* If this works I'm gonna be sad
* Fix BaseAgent cycle budget logic and document attributes
* Document attributes on `Agent`
* Fix import issues between Agent and MessageHistory
* Improve typing

* Extract application code from the agent (#4982)

* Extract application code from the agent

* Wrap interaction loop in a function and call in benchmarks

* Forgot the important function call

* Add docstrings and inline comments to run loop

* Update typing and docstrings in agent

* Docstring formatting

* Separate prompt construction from on_before_think

* Use `self.default_cycle_instruction` in `Agent.think()`

* Fix formatting

* hot fix the SIGINT handler (#4997)

The signal handler in the autogpt/main.py doesn't work properly because of
the clean_input(...) func. This commit remedies this issue.

The issue is mentioned in https://github.com/Significant-Gravitas/Auto-GPT/pull/4799/files/3966cdfd694c2a80c0333823c3bc3da090f85ed3#r1264278776

* Update the sigint handler to be smart enough to actually work (#4999)

* Update the sigint handler to be smart enough to actually work

* Update autogpt/main.py

Co-authored-by: Reinier van der Leer

* Can still use context manager

* Merge in upstream

---------

Co-authored-by: Reinier van der Leer

* Fix CI

* Fix initial prompt construction

* off by one error

* allow exit/EXIT to shut down app

* Remove dead code

---------

Co-authored-by: collijk
Co-authored-by: Cyrus <39694513+cyrus-hawk@users.noreply.github.com>
---
 autogpt/agents/__init__.py                        |   3 +-
 autogpt/agents/agent.py                           | 448 ++++++++-------
 autogpt/agents/base.py                            | 318 +++++++++++++++
 autogpt/json_utils/utilities.py                   |  26 +-
 autogpt/llm/__init__.py                           |   2 +
 autogpt/llm/chat.py                               | 203 ----------
 autogpt/llm/providers/openai.py                   |   2 +-
 autogpt/main.py                                   | 280 ++++++++++++-
 autogpt/memory/message_history.py                 |  51 ++-
 autogpt/setup.py                                  |   2 +-
 autogpt/spinner.py                                |  30 +-
 autogpt/utils.py                                  |   6 +-
 benchmarks.py                                     |   8 +-
 docs/challenges/building_challenges.md            |   4 -
 .../debug_code/test_debug_code_challenge_a.py     |   2 +-
 tests/challenges/utils.py                         |   2 +-
 tests/conftest.py                                 |   8 +-
 tests/integration/agent_factory.py                |   3 -
 tests/integration/test_execute_code.py            |   4 +-
 tests/unit/test_agent.py                          |   5 +-
 tests/unit/test_message_history.py                |  10 +-
 tests/unit/test_spinner.py                        |  19 +-
 tests/unit/test_utils.py                          |  14 +-
 tests/vcr/__init__.py                             |   4 +
 24 files changed, 860 insertions(+), 594 deletions(-)
 create mode 100644 autogpt/agents/base.py
 delete mode 100644 autogpt/llm/chat.py

diff --git a/autogpt/agents/__init__.py b/autogpt/agents/__init__.py
index a6df24ad7..94a5f42a5 100644
--- a/autogpt/agents/__init__.py
+++ b/autogpt/agents/__init__.py
@@ -1,3 +1,4 @@
 from .agent import Agent
+from .base import AgentThoughts, BaseAgent, CommandArgs, CommandName
 
-__all__ = ["Agent"]
+__all__ = ["BaseAgent", "Agent", "CommandName", "CommandArgs", "AgentThoughts"]
diff --git a/autogpt/agents/agent.py b/autogpt/agents/agent.py
index 316cc4d44..f3fee609c 100644
--- a/autogpt/agents/agent.py
+++ b/autogpt/agents/agent.py
@@ -1,315 +1,215 @@
+from __future__ import annotations
+
 import json
-import signal
-import sys
+import time
 from datetime import datetime
 from pathlib import Path
+from typing import TYPE_CHECKING, Any, Optional
 
-from colorama import Fore, Style
+if TYPE_CHECKING:
+    from autogpt.config import AIConfig, Config
+    from autogpt.llm.base import ChatModelResponse, ChatSequence
+    from autogpt.memory.vector import VectorMemory
+    from autogpt.models.command_registry import CommandRegistry
 
-from autogpt.config import Config
-from autogpt.config.ai_config import AIConfig
-from autogpt.json_utils.utilities import extract_json_from_response, validate_json
-from autogpt.llm import ChatModelResponse
-from autogpt.llm.chat import chat_with_ai
-from autogpt.llm.providers.openai import OPEN_AI_CHAT_MODELS
+from autogpt.json_utils.utilities import extract_dict_from_response, validate_dict
+from autogpt.llm.api_manager import ApiManager
+from autogpt.llm.base import Message
 from autogpt.llm.utils import count_string_tokens
-from autogpt.logs import (
+from autogpt.logs import logger
+from autogpt.logs.log_cycle import (
     FULL_MESSAGE_HISTORY_FILE_NAME,
     NEXT_ACTION_FILE_NAME,
     USER_INPUT_FILE_NAME,
     LogCycleHandler,
-    logger,
-    print_assistant_thoughts,
-    remove_ansi_escape,
 )
-from autogpt.memory.message_history import MessageHistory
-from autogpt.memory.vector import VectorMemory
-from autogpt.models.command_registry import CommandRegistry
-from autogpt.speech import say_text
-from autogpt.spinner import Spinner
-from autogpt.utils import clean_input
 from autogpt.workspace import Workspace
 
+from .base import AgentThoughts, BaseAgent, CommandArgs, CommandName
 
-class Agent:
-    """Agent class for interacting with Auto-GPT.
-
-    Attributes:
-        ai_name: The name of the agent.
-        memory: The memory object to use.
-        next_action_count: The number of actions to execute.
-        system_prompt: The system prompt is the initial prompt that defines everything
-          the AI needs to know to achieve its task successfully.
-          Currently, the dynamic and customizable information in the system prompt are
-          ai_name, description and goals.
-
-        triggering_prompt: The last sentence the AI will see before answering.
-            For Auto-GPT, this prompt is:
-            Determine exactly one command to use, and respond using the format specified
-            above:
-            The triggering prompt is not part of the system prompt because between the
-            system prompt and the triggering
-            prompt we have contextual information that can distract the AI and make it
-            forget that its goal is to find the next task to achieve.
-            SYSTEM PROMPT
-            CONTEXTUAL INFORMATION (memory, previous conversations, anything relevant)
-            TRIGGERING PROMPT
-
-        The triggering prompt reminds the AI about its short term meta task
-        (defining the next task)
-    """
 
+class Agent(BaseAgent):
+    """Agent class for interacting with Auto-GPT."""
 
     def __init__(
         self,
-        ai_name: str,
-        memory: VectorMemory,
-        next_action_count: int,
-        command_registry: CommandRegistry,
         ai_config: AIConfig,
-        system_prompt: str,
+        command_registry: CommandRegistry,
+        memory: VectorMemory,
         triggering_prompt: str,
         workspace_directory: str | Path,
         config: Config,
+        cycle_budget: Optional[int] = None,
    ):
-        self.ai_name = ai_name
+        super().__init__(
+            ai_config=ai_config,
+            command_registry=command_registry,
+            config=config,
+            default_cycle_instruction=triggering_prompt,
+            cycle_budget=cycle_budget,
+        )
+
         self.memory = memory
-        self.history = MessageHistory.for_model(config.smart_llm, agent=self)
-        self.next_action_count = next_action_count
-        self.command_registry = command_registry
-        self.config = config
-        self.ai_config = ai_config
-        self.system_prompt = system_prompt
-        self.triggering_prompt = triggering_prompt
+        """VectorMemoryProvider used to manage the agent's context (TODO)"""
+
         self.workspace = Workspace(workspace_directory, config.restrict_to_workspace)
+        """Workspace that the agent has access to, e.g. for reading/writing files."""
+
         self.created_at = datetime.now().strftime("%Y%m%d_%H%M%S")
-        self.cycle_count = 0
+        """Timestamp the agent was created; only used for structured debug logging."""
+
         self.log_cycle_handler = LogCycleHandler()
-        self.smart_token_limit = OPEN_AI_CHAT_MODELS.get(config.smart_llm).max_tokens
-
-    def start_interaction_loop(self):
-        # Interaction Loop
-        self.cycle_count = 0
-        command_name = None
-        arguments = None
-        user_input = ""
-
-        # Signal handler for interrupting y -N
-        def signal_handler(signum, frame):
-            if self.next_action_count == 0:
-                sys.exit()
-            else:
-                print(
-                    Fore.RED
-                    + "Interrupt signal received. Stopping continuous command execution."
-                    + Style.RESET_ALL
-                )
-                self.next_action_count = 0
+        """LogCycleHandler for structured debug logging."""
+
+    def construct_base_prompt(self, *args, **kwargs) -> ChatSequence:
+        if kwargs.get("prepend_messages") is None:
+            kwargs["prepend_messages"] = []
+
+        # Clock
+        kwargs["prepend_messages"].append(
+            Message("system", f"The current time and date is {time.strftime('%c')}"),
+        )
 
-        signal.signal(signal.SIGINT, signal_handler)
+        # Add budget information (if any) to prompt
+        api_manager = ApiManager()
+        if api_manager.get_total_budget() > 0.0:
+            remaining_budget = (
+                api_manager.get_total_budget() - api_manager.get_total_cost()
+            )
+            if remaining_budget < 0:
+                remaining_budget = 0
+
+            budget_msg = Message(
+                "system",
+                f"Your remaining API budget is ${remaining_budget:.3f}"
+                + (
+                    " BUDGET EXCEEDED! SHUT DOWN!\n\n"
+                    if remaining_budget == 0
+                    else " Budget very nearly exceeded! Shut down gracefully!\n\n"
+                    if remaining_budget < 0.005
+                    else " Budget nearly exceeded. Finish up.\n\n"
+                    if remaining_budget < 0.01
+                    else ""
+                ),
+            )
+            logger.debug(budget_msg)
+
+            if kwargs.get("append_messages") is None:
+                kwargs["append_messages"] = []
+            kwargs["append_messages"].append(budget_msg)
+
+        return super().construct_base_prompt(*args, **kwargs)
+
+    def on_before_think(self, *args, **kwargs) -> ChatSequence:
+        prompt = super().on_before_think(*args, **kwargs)
+
+        self.log_cycle_handler.log_count_within_cycle = 0
+        self.log_cycle_handler.log_cycle(
+            self.ai_config.ai_name,
+            self.created_at,
+            self.cycle_count,
+            self.history.raw(),
+            FULL_MESSAGE_HISTORY_FILE_NAME,
+        )
+        return prompt
 
-        while True:
-            # Discontinue if continuous limit is reached
-            self.cycle_count += 1
-            self.log_cycle_handler.log_count_within_cycle = 0
+    def execute(
+        self,
+        command_name: str | None,
+        command_args: dict[str, str] | None,
+        user_input: str | None,
+    ) -> str:
+        # Execute command
+        if command_name is not None and command_name.lower().startswith("error"):
+            result = f"Could not execute command: {command_name}{command_args}"
+        elif command_name == "human_feedback":
+            result = f"Human feedback: {user_input}"
             self.log_cycle_handler.log_cycle(
                 self.ai_config.ai_name,
                 self.created_at,
                 self.cycle_count,
-                [m.raw() for m in self.history],
-                FULL_MESSAGE_HISTORY_FILE_NAME,
+                user_input,
+                USER_INPUT_FILE_NAME,
             )
-            if (
-                self.config.continuous_mode
-                and self.config.continuous_limit > 0
-                and self.cycle_count > self.config.continuous_limit
-            ):
-                logger.typewriter_log(
-                    "Continuous Limit Reached: ",
-                    Fore.YELLOW,
-                    f"{self.config.continuous_limit}",
-                )
-                break
-            # Send message to AI, get response
-            with Spinner("Thinking... ", plain_output=self.config.plain_output):
-                assistant_reply = chat_with_ai(
-                    self.config,
-                    self,
-                    self.system_prompt,
-                    self.triggering_prompt,
-                    self.smart_token_limit,
-                    self.config.smart_llm,
-                )
-
-            try:
-                assistant_reply_json = extract_json_from_response(
-                    assistant_reply.content
-                )
-                validate_json(assistant_reply_json, self.config)
-            except json.JSONDecodeError as e:
-                logger.error(f"Exception while validating assistant reply JSON: {e}")
-                assistant_reply_json = {}
+        else:
             for plugin in self.config.plugins:
-                if not plugin.can_handle_post_planning():
+                if not plugin.can_handle_pre_command():
                     continue
-                assistant_reply_json = plugin.post_planning(assistant_reply_json)
-
-            # Print Assistant thoughts
-            if assistant_reply_json != {}:
-                # Get command name and arguments
-                try:
-                    print_assistant_thoughts(
-                        self.ai_name, assistant_reply_json, self.config
-                    )
-                    command_name, arguments = extract_command(
-                        assistant_reply_json, assistant_reply, self.config
-                    )
-                    if self.config.speak_mode:
-                        say_text(f"I want to execute {command_name}", self.config)
-
-                except Exception as e:
-                    logger.error("Error: \n", str(e))
-            self.log_cycle_handler.log_cycle(
-                self.ai_config.ai_name,
-                self.created_at,
-                self.cycle_count,
-                assistant_reply_json,
-                NEXT_ACTION_FILE_NAME,
+                command_name, arguments = plugin.pre_command(command_name, command_args)
+            command_result = execute_command(
+                command_name=command_name,
+                arguments=command_args,
+                agent=self,
             )
+            result = f"Command {command_name} returned: " f"{command_result}"
 
-            # First log new-line so user can differentiate sections better in console
-            logger.typewriter_log("\n")
-            logger.typewriter_log(
-                "NEXT ACTION: ",
-                Fore.CYAN,
-                f"COMMAND = {Fore.CYAN}{remove_ansi_escape(command_name)}{Style.RESET_ALL} "
-                f"ARGUMENTS = {Fore.CYAN}{arguments}{Style.RESET_ALL}",
+        result_tlength = count_string_tokens(str(command_result), self.llm.name)
+        memory_tlength = count_string_tokens(
+            str(self.history.summary_message()), self.llm.name
             )
+        if result_tlength + memory_tlength > self.send_token_limit:
+            result = f"Failure: command {command_name} returned too much output. \
+                Do not execute this command again with the same arguments."
 
-            if not self.config.continuous_mode and self.next_action_count == 0:
-                # ### GET USER AUTHORIZATION TO EXECUTE COMMAND ###
-                # Get key press: Prompt the user to press enter to continue or escape
-                # to exit
-                self.user_input = ""
-                logger.info(
-                    f"Enter '{self.config.authorise_key}' to authorise command, "
-                    f"'{self.config.authorise_key} -N' to run N continuous commands, "
-                    f"'{self.config.exit_key}' to exit program, or enter feedback for "
-                    f"{self.ai_name}..."
-                )
-                while True:
-                    if self.config.chat_messages_enabled:
-                        console_input = clean_input(
-                            self.config, "Waiting for your response..."
-                        )
-                    else:
-                        console_input = clean_input(
-                            self.config, Fore.MAGENTA + "Input:" + Style.RESET_ALL
-                        )
-                    if console_input.lower().strip() == self.config.authorise_key:
-                        user_input = "GENERATE NEXT COMMAND JSON"
-                        break
-                    elif console_input.lower().strip() == "":
-                        logger.warn("Invalid input format.")
-                        continue
-                    elif console_input.lower().startswith(
-                        f"{self.config.authorise_key} -"
-                    ):
-                        try:
-                            self.next_action_count = abs(
-                                int(console_input.split(" ")[1])
-                            )
-                            user_input = "GENERATE NEXT COMMAND JSON"
-                        except ValueError:
-                            logger.warn(
-                                f"Invalid input format. Please enter '{self.config.authorise_key} -n' "
-                                "where n is the number of continuous tasks."
-