Diffstat (limited to 'benchmark/tests/test_benchmark_workflow.py')
-rw-r--r--  benchmark/tests/test_benchmark_workflow.py  83
1 file changed, 83 insertions(+), 0 deletions(-)
diff --git a/benchmark/tests/test_benchmark_workflow.py b/benchmark/tests/test_benchmark_workflow.py
new file mode 100644
index 000000000..ca3eec88b
--- /dev/null
+++ b/benchmark/tests/test_benchmark_workflow.py
@@ -0,0 +1,83 @@
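+# End-to-end workflow test: create a task through the benchmark proxy
+# (assumed to be listening on localhost:8080), check that it is forwarded
+# to the agent (assumed on localhost:8000), run a single step, and inspect
+# the resulting evaluation. Both services must already be running before
+# invoking pytest on this file.
+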
+import datetime
+import time
+
+import pytest
+import requests
+
+URL_BENCHMARK = "http://localhost:8080/ap/v1"
+URL_AGENT = "http://localhost:8000/ap/v1"
+
+
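+# Each case pins one known challenge: its eval_id in the benchmark's challenge
+# set, the task input, the number of artifacts expected on the agent's task
+# right after creation (before any step runs), and the expected evaluation
+# outcome.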
+@pytest.mark.parametrize(
+ "eval_id, input_text, expected_artifact_length, test_name, should_be_successful",
+ [
+ (
+ "021c695a-6cc4-46c2-b93a-f3a9b0f4d123",
+ "Write the word 'Washington' to a .txt file",
+ 0,
+ "WriteFile",
+ True,
+ ),
+ (
+ "f219f3d3-a41b-45a9-a3d0-389832086ee8",
+ "Read the file called file_to_read.txt and write its content to a file called output.txt",
+ 1,
+ "ReadFile",
+ False,
+ ),
+ ],
+)
+def test_entire_workflow(
+ eval_id, input_text, expected_artifact_length, test_name, should_be_successful
+):
+ task_request = {"eval_id": eval_id, "input": input_text}
+ response = requests.get(f"{URL_AGENT}/agent/tasks")
+ task_count_before = response.json()["pagination"]["total_items"]
+    # Create the task through the benchmark proxy; it should be forwarded to
+    # the agent, increasing the agent's task count by exactly one.
+    task_response_benchmark = requests.post(
+        f"{URL_BENCHMARK}/agent/tasks", json=task_request
+    )
+ response = requests.get(f"{URL_AGENT}/agent/tasks")
+ task_count_after = response.json()["pagination"]["total_items"]
+ assert task_count_after == task_count_before + 1
+
+    timestamp_after_task_eval_created = datetime.datetime.now(datetime.timezone.utc)
+    time.sleep(1.1)  # ensure the two timestamps compared at the end differ
+ assert task_response_benchmark.status_code == 200
+ task_response_benchmark = task_response_benchmark.json()
+ assert task_response_benchmark["input"] == input_text
+
+ task_response_benchmark_id = task_response_benchmark["task_id"]
+
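+    # The task_id issued by the benchmark should resolve on the agent as well,
+    # carrying whatever artifacts were attached at creation time.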
+ response_task_agent = requests.get(
+ f"{URL_AGENT}/agent/tasks/{task_response_benchmark_id}"
+ )
+ assert response_task_agent.status_code == 200
+ response_task_agent = response_task_agent.json()
+ assert len(response_task_agent["artifacts"]) == expected_artifact_length
+
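+    # Run a single step through the benchmark proxy, reusing the task input
+    # as the step input.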
+ step_request = {"input": input_text}
+
+    step_response = requests.post(
+        f"{URL_BENCHMARK}/agent/tasks/{task_response_benchmark_id}/steps",
+        json=step_request,
+    )
+ assert step_response.status_code == 200
+ step_response = step_response.json()
+    assert step_response["is_last"] is True  # assumes these tasks finish in a single step
+
+    eval_response = requests.post(
+        f"{URL_BENCHMARK}/agent/tasks/{task_response_benchmark_id}/evaluations",
+        json={},
+    )
+ assert eval_response.status_code == 200
+ eval_response = eval_response.json()
+    print("eval_response:", eval_response)  # aids debugging when an assert fails
+ assert eval_response["run_details"]["test_name"] == test_name
+ assert eval_response["metrics"]["success"] == should_be_successful
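+    # The benchmark's recorded start time must predate the timestamp captured
+    # just after the task was created.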
+ benchmark_start_time = datetime.datetime.fromisoformat(
+ eval_response["run_details"]["benchmark_start_time"]
+ )
+
+ assert benchmark_start_time < timestamp_after_task_eval_created