about | summary | refs | log | tree | commit | diff
path: root/benchmark
diff options
context:
space:
mode:
author: Silen Naihin <silen.naihin@gmail.com> 2023-10-20 08:26:29 -0700
committer: Silen Naihin <silen.naihin@gmail.com> 2023-10-20 08:26:29 -0700
commit: 825c3adf62879fa9f91a19c11010336de5c98bfc (patch)
tree: 76d44228577bfe5d50efaa0f8c4df7e9d9532b36 /benchmark
parent: Merge branch 'master' of github.com:Significant-Gravitas/Auto-GPT (diff)
download: Auto-GPT-825c3adf62879fa9f91a19c11010336de5c98bfc.tar.gz
          Auto-GPT-825c3adf62879fa9f91a19c11010336de5c98bfc.tar.bz2
          Auto-GPT-825c3adf62879fa9f91a19c11010336de5c98bfc.zip
case sensitivity, updating challenges
Diffstat (limited to 'benchmark')
-rw-r--r-- benchmark/agbenchmark/challenges/verticals/data/2_label_csv/data.json | 1
-rw-r--r-- benchmark/agbenchmark/challenges/verticals/scrape/5_get_information/data.json | 1
-rw-r--r-- benchmark/agbenchmark/challenges/verticals/scrape/6_find_autogpt_creator/data.json | 3
-rw-r--r-- benchmark/agbenchmark/challenges/verticals/scrape/7_amazon_bestseller/data.json | 1
-rw-r--r-- benchmark/agbenchmark/challenges/verticals/scrape/8_forbes_list/data.json | 1
-rw-r--r-- benchmark/agbenchmark/utils/challenge.py | 6
-rw-r--r-- benchmark/agbenchmark/utils/data_types.py | 1
7 files changed, 13 insertions, 1 deletion
diff --git a/benchmark/agbenchmark/challenges/verticals/data/2_label_csv/data.json b/benchmark/agbenchmark/challenges/verticals/data/2_label_csv/data.json
index d4967c1b0..dfbcca141 100644
--- a/benchmark/agbenchmark/challenges/verticals/data/2_label_csv/data.json
+++ b/benchmark/agbenchmark/challenges/verticals/data/2_label_csv/data.json
@@ -9,6 +9,7 @@
"eval_id": "6e2bf1f0-6842-4704-8ed1-b17c2065bbac",
"ground": {
"answer": "The csv labelled",
+ "case_sensitive": true,
"eval": {
"type": "file"
},
diff --git a/benchmark/agbenchmark/challenges/verticals/scrape/5_get_information/data.json b/benchmark/agbenchmark/challenges/verticals/scrape/5_get_information/data.json
index 703636bcb..f198d2e4e 100644
--- a/benchmark/agbenchmark/challenges/verticals/scrape/5_get_information/data.json
+++ b/benchmark/agbenchmark/challenges/verticals/scrape/5_get_information/data.json
@@ -10,6 +10,7 @@
"eval_id": "1758058c-f726-484f-96fa-f05e278e5ff5",
"ground": {
"answer": "The twitter handles of the two hosts of Latent Space.",
+ "case_sensitive": false,
"eval": {
"type": "file"
},
diff --git a/benchmark/agbenchmark/challenges/verticals/scrape/6_find_autogpt_creator/data.json b/benchmark/agbenchmark/challenges/verticals/scrape/6_find_autogpt_creator/data.json
index c3c069f2e..3a6962350 100644
--- a/benchmark/agbenchmark/challenges/verticals/scrape/6_find_autogpt_creator/data.json
+++ b/benchmark/agbenchmark/challenges/verticals/scrape/6_find_autogpt_creator/data.json
@@ -10,6 +10,7 @@
"eval_id": "1758058c-f726-484f-96fa-f05e278e5ff5",
"ground": {
"answer": "Toran is from Scotland https://uk.linkedin.com/in/toran-richards.",
+ "case_sensitive": true,
"eval": {
"type": "file"
},
@@ -17,7 +18,7 @@
"output.txt"
],
"should_contain": [
- "cotland"
+ "Scotland"
],
"should_not_contain": []
},
diff --git a/benchmark/agbenchmark/challenges/verticals/scrape/7_amazon_bestseller/data.json b/benchmark/agbenchmark/challenges/verticals/scrape/7_amazon_bestseller/data.json
index bb7337024..4e1327628 100644
--- a/benchmark/agbenchmark/challenges/verticals/scrape/7_amazon_bestseller/data.json
+++ b/benchmark/agbenchmark/challenges/verticals/scrape/7_amazon_bestseller/data.json
@@ -11,6 +11,7 @@
"eval_id": "1758058c-f726-484f-96fa-f05e278e5ff5",
"ground": {
"answer": "https://www.amazon.com/gp/bestsellers/2021/books, second book, the answer is Reed",
+ "case_sensitive": true,
"eval": {
"type": "file"
},
diff --git a/benchmark/agbenchmark/challenges/verticals/scrape/8_forbes_list/data.json b/benchmark/agbenchmark/challenges/verticals/scrape/8_forbes_list/data.json
index 48db48cea..a4fe85c6a 100644
--- a/benchmark/agbenchmark/challenges/verticals/scrape/8_forbes_list/data.json
+++ b/benchmark/agbenchmark/challenges/verticals/scrape/8_forbes_list/data.json
@@ -11,6 +11,7 @@
"eval_id": "1758058c-f726-484f-96fa-f05e278e5ff5",
"ground": {
"answer": "Get to https://www.forbes.com/special-report/2012/30-under-30/30-under-30_games.html, then https://www.linkedin.com/in/brianjcho/details/experience/ is the first 27 year old, then find his longest working job on Linkedin which is Riot Games.",
+ "case_sensitive": true,
"eval": {
"type": "file"
},
diff --git a/benchmark/agbenchmark/utils/challenge.py b/benchmark/agbenchmark/utils/challenge.py
index 36bf0af22..20353f685 100644
--- a/benchmark/agbenchmark/utils/challenge.py
+++ b/benchmark/agbenchmark/utils/challenge.py
@@ -123,6 +123,9 @@ class Challenge(ABC):
print("\033[1;34mScoring content:\033[0m", content)
if ground.should_contain:
for should_contain_word in ground.should_contain:
+ if not getattr(ground, 'case_sensitive', True):
+ should_contain_word = should_contain_word.lower()
+ content = content.lower()
print_content = (
f"\033[1;34mWord that should exist\033[0m - {should_contain_word}:"
)
@@ -134,6 +137,9 @@ class Challenge(ABC):
if ground.should_not_contain:
for should_not_contain_word in ground.should_not_contain:
+ if not getattr(ground, 'case_sensitive', True):
+ should_not_contain_word = should_not_contain_word.lower()
+ content = content.lower()
print_content = f"\033[1;34mWord that should not exist\033[0m - {should_not_contain_word}:"
if should_not_contain_word in content:
print(print_content, "False")
diff --git a/benchmark/agbenchmark/utils/data_types.py b/benchmark/agbenchmark/utils/data_types.py
index 8f1e53d0f..74b509329 100644
--- a/benchmark/agbenchmark/utils/data_types.py
+++ b/benchmark/agbenchmark/utils/data_types.py
@@ -165,6 +165,7 @@ class Ground(BaseModel):
should_contain: Optional[List[str]] = None
should_not_contain: Optional[List[str]] = None
files: List[str]
+ case_sensitive: Optional[bool] = True
eval: Eval