aboutsummaryrefslogtreecommitdiff
path: root/benchmark
diff options
context:
space:
mode:
author Silen Naihin <silen.naihin@gmail.com> 2023-10-19 17:39:09 -0700
committer Silen Naihin <silen.naihin@gmail.com> 2023-10-19 17:39:09 -0700
commit 7ddef399189ff55d37daff741cf28e50e8ab4f1c (patch)
tree 45edc6369801523201da13ea08f3e00964d7a8d8 /benchmark
parent Fix typo in exceptions.py (#5813) (diff)
download Auto-GPT-7ddef399189ff55d37daff741cf28e50e8ab4f1c.tar.gz
Auto-GPT-7ddef399189ff55d37daff741cf28e50e8ab4f1c.tar.bz2
Auto-GPT-7ddef399189ff55d37daff741cf28e50e8ab4f1c.zip
scrape synthesize challenge additions
Diffstat (limited to 'benchmark')
-rw-r--r-- benchmark/agbenchmark/challenges/verticals/scrape/4_revenue_retrieval_2/data.json 3
-rw-r--r-- benchmark/agbenchmark/challenges/verticals/scrape/5_get_information/data.json 2
-rw-r--r-- benchmark/agbenchmark/challenges/verticals/scrape/6_find_cpo_microsoft/artifacts_out/output.txt 1
-rw-r--r-- benchmark/agbenchmark/challenges/verticals/scrape/6_find_cpo_microsoft/data.json 34
-rw-r--r-- benchmark/agbenchmark/challenges/verticals/scrape/7_amazon_bestseller/artifacts_out/output.txt 1
-rw-r--r-- benchmark/agbenchmark/challenges/verticals/scrape/7_amazon_bestseller/data.json 34
-rw-r--r-- benchmark/agbenchmark/challenges/verticals/scrape/8_forbes_list/artifacts_out/output.txt 1
-rw-r--r-- benchmark/agbenchmark/challenges/verticals/scrape/8_forbes_list/data.json 34
-rw-r--r-- benchmark/agbenchmark/challenges/verticals/synthesize/1_basic_content_gen/artifacts_out/output.txt 24
-rw-r--r-- benchmark/agbenchmark/challenges/verticals/synthesize/1_basic_content_gen/data.json 4
10 files changed, 133 insertions, 5 deletions
diff --git a/benchmark/agbenchmark/challenges/verticals/scrape/4_revenue_retrieval_2/data.json b/benchmark/agbenchmark/challenges/verticals/scrape/4_revenue_retrieval_2/data.json
index c57835aef..8ca61b9e7 100644
--- a/benchmark/agbenchmark/challenges/verticals/scrape/4_revenue_retrieval_2/data.json
+++ b/benchmark/agbenchmark/challenges/verticals/scrape/4_revenue_retrieval_2/data.json
@@ -21,8 +21,7 @@
"117",
"204",
"413",
- "2,013",
- "2,014",
+ "2,0",
"3,198",
"4,046",
"7,000",
diff --git a/benchmark/agbenchmark/challenges/verticals/scrape/5_get_information/data.json b/benchmark/agbenchmark/challenges/verticals/scrape/5_get_information/data.json
index bec0b9c84..703636bcb 100644
--- a/benchmark/agbenchmark/challenges/verticals/scrape/5_get_information/data.json
+++ b/benchmark/agbenchmark/challenges/verticals/scrape/5_get_information/data.json
@@ -5,7 +5,7 @@
],
"cutoff": 60,
"dependencies": [
- "TestRevenueRetrieval2"
+ "TestRevenueRetrieval"
],
"eval_id": "1758058c-f726-484f-96fa-f05e278e5ff5",
"ground": {
diff --git a/benchmark/agbenchmark/challenges/verticals/scrape/6_find_cpo_microsoft/artifacts_out/output.txt b/benchmark/agbenchmark/challenges/verticals/scrape/6_find_cpo_microsoft/artifacts_out/output.txt
new file mode 100644
index 000000000..c37330ccc
--- /dev/null
+++ b/benchmark/agbenchmark/challenges/verticals/scrape/6_find_cpo_microsoft/artifacts_out/output.txt
@@ -0,0 +1 @@
+Scotland
\ No newline at end of file
diff --git a/benchmark/agbenchmark/challenges/verticals/scrape/6_find_cpo_microsoft/data.json b/benchmark/agbenchmark/challenges/verticals/scrape/6_find_cpo_microsoft/data.json
new file mode 100644
index 000000000..112df0cb8
--- /dev/null
+++ b/benchmark/agbenchmark/challenges/verticals/scrape/6_find_cpo_microsoft/data.json
@@ -0,0 +1,34 @@
+{
+ "category": [
+ "scrape_synthesize",
+ "general"
+ ],
+ "cutoff": 60,
+ "dependencies": [
+ "TestRevenueRetrieval"
+ ],
+ "eval_id": "1758058c-f726-484f-96fa-f05e278e5ff5",
+ "ground": {
+ "answer": "Toran is from Scotland https://uk.linkedin.com/in/toran-richards.",
+ "eval": {
+ "type": "file"
+ },
+ "files": [
+ "output.txt"
+ ],
+ "should_contain": [
+ "Scotland",
+ "scotland"
+ ],
+ "should_not_contain": []
+ },
+ "info": {
+ "description": "Tests if the agent can do multi-step retrieval (1. find who the CEO of AutoGPT is, 2. find where Toran is from)",
+ "difficulty": "intermediate",
+ "side_effects": [
+ ""
+ ]
+ },
+ "name": "TestAutoGPTCountry",
+ "task": "Write the country the inventor/creator of AutoGPT is from to a file called output.txt"
+}
diff --git a/benchmark/agbenchmark/challenges/verticals/scrape/7_amazon_bestseller/artifacts_out/output.txt b/benchmark/agbenchmark/challenges/verticals/scrape/7_amazon_bestseller/artifacts_out/output.txt
new file mode 100644
index 000000000..270f18b41
--- /dev/null
+++ b/benchmark/agbenchmark/challenges/verticals/scrape/7_amazon_bestseller/artifacts_out/output.txt
@@ -0,0 +1 @@
+Reed
\ No newline at end of file
diff --git a/benchmark/agbenchmark/challenges/verticals/scrape/7_amazon_bestseller/data.json b/benchmark/agbenchmark/challenges/verticals/scrape/7_amazon_bestseller/data.json
new file mode 100644
index 000000000..bb7337024
--- /dev/null
+++ b/benchmark/agbenchmark/challenges/verticals/scrape/7_amazon_bestseller/data.json
@@ -0,0 +1,34 @@
+{
+ "category": [
+ "scrape_synthesize",
+ "general"
+ ],
+ "cutoff": 60,
+ "dependencies": [
+ "TestAutoGPTCountry",
+ "TestGetInformation"
+ ],
+ "eval_id": "1758058c-f726-484f-96fa-f05e278e5ff5",
+ "ground": {
+ "answer": "https://www.amazon.com/gp/bestsellers/2021/books, second book, the answer is Reed",
+ "eval": {
+ "type": "file"
+ },
+ "files": [
+ "output.txt"
+ ],
+ "should_contain": [
+ "Reed"
+ ],
+ "should_not_contain": []
+ },
+ "info": {
+ "description": "Tests if the agent can follow very specific instructions for retrieval.",
+ "difficulty": "intermediate",
+ "side_effects": [
+ ""
+ ]
+ },
+ "name": "TestAmazonBestseller",
+ "task": "Write the middle name of the author of the best selling book on Amazon from 2021 that got the fewest reviews out of the first six books on the best selling list to a file called output.txt."
+}
diff --git a/benchmark/agbenchmark/challenges/verticals/scrape/8_forbes_list/artifacts_out/output.txt b/benchmark/agbenchmark/challenges/verticals/scrape/8_forbes_list/artifacts_out/output.txt
new file mode 100644
index 000000000..22696ea3b
--- /dev/null
+++ b/benchmark/agbenchmark/challenges/verticals/scrape/8_forbes_list/artifacts_out/output.txt
@@ -0,0 +1 @@
+Head of Ventures, M&A, and Business Development
\ No newline at end of file
diff --git a/benchmark/agbenchmark/challenges/verticals/scrape/8_forbes_list/data.json b/benchmark/agbenchmark/challenges/verticals/scrape/8_forbes_list/data.json
new file mode 100644
index 000000000..48db48cea
--- /dev/null
+++ b/benchmark/agbenchmark/challenges/verticals/scrape/8_forbes_list/data.json
@@ -0,0 +1,34 @@
+{
+ "category": [
+ "scrape_synthesize",
+ "general"
+ ],
+ "cutoff": 60,
+ "dependencies": [
+ "TestAutoGPTCountry",
+ "TestGetInformation"
+ ],
+ "eval_id": "1758058c-f726-484f-96fa-f05e278e5ff5",
+ "ground": {
+ "answer": "Go to https://www.forbes.com/special-report/2012/30-under-30/30-under-30_games.html, then https://www.linkedin.com/in/brianjcho/details/experience/ is the first 27-year-old, then find his longest-held job on LinkedIn, which is at Riot Games.",
+ "eval": {
+ "type": "file"
+ },
+ "files": [
+ "output.txt"
+ ],
+ "should_contain": [
+ "Head of Ventures, M&A, and Business Development"
+ ],
+ "should_not_contain": []
+ },
+ "info": {
+ "description": "Tests if the agent can navigate the web through trials and tribulations.",
+ "difficulty": "intermediate",
+ "side_effects": [
+ ""
+ ]
+ },
+ "name": "TestForbesList",
+ "task": "Find the Forbes 30 under 30 list for 2012. For the Games category, find the company that the first 27-year-old on the list spent the longest amount of time working at according to LinkedIn. What was his role at this company?"
+}
diff --git a/benchmark/agbenchmark/challenges/verticals/synthesize/1_basic_content_gen/artifacts_out/output.txt b/benchmark/agbenchmark/challenges/verticals/synthesize/1_basic_content_gen/artifacts_out/output.txt
index e69de29bb..4e2fedb30 100644
--- a/benchmark/agbenchmark/challenges/verticals/synthesize/1_basic_content_gen/artifacts_out/output.txt
+++ b/benchmark/agbenchmark/challenges/verticals/synthesize/1_basic_content_gen/artifacts_out/output.txt
@@ -0,0 +1,24 @@
+Addressing Challenges and Capitalizing on Trends: A Corporate Perspective
+
+As global issues evolve, companies are innovating and pivoting to meet the challenges and capture market opportunities. This report highlights how certain companies from our database are addressing or capitalizing on significant challenges and trends:
+
+1. Rising levels of air pollution in major cities:
+
+Tesla is at the forefront of the electric vehicle industry. By producing electric cars, they help to reduce the emissions that contribute to air pollution in cities. Additionally, with their solar panels and energy storage solutions, they offer cleaner alternatives to conventional energy sources, further combating air pollution.
+2. The decline of linguistic diversity and death of minor languages:
+
+Duolingo provides a platform for language learning, potentially aiding in the preservation of minor languages. By offering a wide variety of languages, including less commonly spoken ones, they encourage users to learn and potentially contribute to the continuation of linguistic diversity.
+3. Increased demand for sustainable and eco-friendly products:
+
+Beyond Meat is addressing the demand for more sustainable food products by offering plant-based meat substitutes. These products cater to consumers concerned about the environmental and ethical implications of meat consumption.
+
+Ecolife Recycling offers biodegradable products and eco-friendly packaging solutions, catering directly to the demand for sustainable products and reducing the environmental footprint of packaging waste.
+
+4. The remote work revolution due to global pandemics:
+
+Zoom has become a household name in the realm of video conferencing software, capitalizing on the remote work trend. With companies and schools transitioning to remote settings during global pandemics, Zoom's platform facilitates effective communication and collaboration.
+5. Growing concerns about meat consumption's environmental and ethical implications:
+
+As mentioned, Beyond Meat addresses this challenge by providing consumers with plant-based meat substitutes, presenting an alternative that reduces the environmental strain and ethical concerns associated with traditional meat production.
+In conclusion, businesses are actively aligning their products and services to address emerging challenges and capitalize on trends. This alignment not only ensures their relevance but also signifies their commitment to global well-being.
+
diff --git a/benchmark/agbenchmark/challenges/verticals/synthesize/1_basic_content_gen/data.json b/benchmark/agbenchmark/challenges/verticals/synthesize/1_basic_content_gen/data.json
index 4b6c7073f..68ae89288 100644
--- a/benchmark/agbenchmark/challenges/verticals/synthesize/1_basic_content_gen/data.json
+++ b/benchmark/agbenchmark/challenges/verticals/synthesize/1_basic_content_gen/data.json
@@ -9,7 +9,7 @@
],
"eval_id": "895ae28a-4513-44ea-a872-0164771d1597",
"ground": {
- "answer": "A report highlighting elements from the 2 files.",
+ "answer": "Is the company mentioned in the output actively addressing or capitalizing on the challenges or trends listed?",
"eval": {
"scoring": "binary",
"template": "question",
@@ -19,7 +19,7 @@
"output.txt"
],
"should_contain": [
- "Is the company mentioned in the output actively addressing or capitalizing on the challenges or trends listed?"
+ ""
],
"should_not_contain": []
},