feat(benchmark): JungleGym WebArena (#6691)

* feat(benchmark): Add JungleGym WebArena challenges - Add `WebArenaChallenge`, `WebArenaChallengeSpec`, and other logic to make these challenges work - Add WebArena challenges to Pytest collection endpoint generate_test.py * feat(benchmark/webarena): Add hand-picked selection of WebArena challenges
author: Reinier van der Leer <pwuts@agpt.co> 2024-01-19 20:34:04 +0100
committer: GitHub <noreply@github.com> 2024-01-19 20:34:04 +0100
commit: 488f40a20fbd4c5929611ed97ff6782c938b481d (patch)
tree: 1db933c2c0b544e72609b95f0f7b1ef6f8c3ea9d
parent: fix(benchmark/report): Fix and clean up logic in `update_challenges_already_b... (diff)
download: Auto-GPT-488f40a20fbd4c5929611ed97ff6782c938b481d.tar.gz
Auto-GPT-488f40a20fbd4c5929611ed97ff6782c938b481d.tar.bz2
Auto-GPT-488f40a20fbd4c5929611ed97ff6782c938b481d.zip
4 files changed, 1005 insertions, 1 deletions
diff --git a/benchmark/agbenchmark/challenges/__init__.py b/benchmark/agbenchmark/challenges/__init__.py
index cee8b28ec..68105d854 100644
--- a/benchmark/agbenchmark/challenges/__init__.py
+++ b/benchmark/agbenchmark/challenges/__init__.py
@@ -11,12 +11,16 @@ logger = logging.getLogger(__name__)
 
 def get_challenge_from_source_uri(source_uri: str) -> type[BaseChallenge]:
+    """
+    Resolve `source_uri` to a challenge class of the provider it belongs to.
+
+    Raises:
+        ValueError: if no known challenge provider matches `source_uri`.
+    """
     from .builtin import BuiltinChallenge
+    from .webarena import WebArenaChallenge
 
     provider_prefix = source_uri.split("/", 1)[0]
 
     if provider_prefix == BuiltinChallenge.SOURCE_URI_PREFIX:
         return BuiltinChallenge.from_source_uri(source_uri)
 
+    # WebArenaChallenge.SOURCE_URI_PREFIX contains slashes, so it can never be equal
+    # to `provider_prefix` (the text before the first "/"); match the full URI instead.
+    if source_uri.startswith(WebArenaChallenge.SOURCE_URI_PREFIX):
+        return WebArenaChallenge.from_source_uri(source_uri)
+
     raise ValueError(f"Cannot resolve source_uri '{source_uri}'")
 
 
diff --git a/benchmark/agbenchmark/challenges/webarena.py b/benchmark/agbenchmark/challenges/webarena.py
new file mode 100644
index 000000000..aa644520e
--- /dev/null
+++ b/benchmark/agbenchmark/challenges/webarena.py
@@ -0,0 +1,475 @@
+import logging
+import os
+from abc import ABC, abstractmethod
+from typing import ClassVar, Iterator, Literal
+
+import pytest
+import requests
+from agent_protocol_client import AgentApi, Step
+from pydantic import BaseModel, validator, ValidationError
+
+from agbenchmark.config import AgentBenchmarkConfig
+from agbenchmark.utils.data_types import Category, EvalResult
+
+from .base import BaseChallenge, ChallengeInfo
+
+logger = logging.getLogger(__name__)
+
+
+# Kinds of evaluation that a task can list in its `eval_types`
+EvalType = Literal["string_match", "url_match", "program_html"]
+# Names of the JungleGym sites; these are the keys of `site_info_map`
+WebArenaSite = Literal[
+    "gitlab", "map", "reddit", "shopping", "shopping_admin", "wikipedia"
+]
+# Values of the `type` field of the `StringEval` subclasses
+ReferenceAnswerType = Literal["exact_match", "fuzzy_match", "must_include"]
+
+
+class WebArenaSiteInfo(BaseModel):
+    """Connection details and availability status of one JungleGym site."""
+
+    # URL that `get_site_url` returns for the site
+    base_url: str
+    # When False, `load_webarena_challenges` skips challenges that need this site
+    available: bool = True
+    # Free-form extra information about the site, e.g. login instructions
+    additional_info: str = ""
+    # Explanation that is logged when the site is not available
+    unavailable_reason: str = ""
+
+
+# Credentials for the GitLab mirror, expected as USERNAME:PASSWORD.
+# `partition` splits on the first colon only, so a password that contains ":" or a
+# value without any ":" does not raise at import time. In the latter case the
+# password is empty, which marks the site as unavailable below.
+_git_user, _, _git_password = os.getenv("WEBARENA_GIT_CREDENTIALS", ":").partition(":")
+
+# Details of every known JungleGym site, keyed by site name
+site_info_map: dict[WebArenaSite, WebArenaSiteInfo] = {
+    "gitlab": WebArenaSiteInfo(
+        base_url="http://git.junglegym.ai",
+        # Only usable when both a username and a password were provided
+        available=bool(_git_user and _git_password),
+        additional_info=(
+            f"To log in, use the username '{_git_user}' and password '{_git_password}'."
+        ),
+        unavailable_reason=(
+            "WEBARENA_GIT_CREDENTIALS not set (correctly): "
+            f"'{os.getenv('WEBARENA_GIT_CREDENTIALS', '')}', "
+            "should be USERNAME:PASSWORD."
+        ),
+    ),
+    "map": WebArenaSiteInfo(
+        base_url="http://ec2-3-131-244-37.us-east-2.compute.amazonaws.com:3000/"
+    ),
+    "reddit": WebArenaSiteInfo(base_url="http://forum.junglegym.ai"),
+    "shopping": WebArenaSiteInfo(base_url="http://shop.junglegym.ai"),
+    "shopping_admin": WebArenaSiteInfo(
+        base_url="http://cms.junglegym.ai/admin",
+        additional_info="To log in, use the username 'admin' and password 'admin1234'.",
+    ),
+    "wikipedia": WebArenaSiteInfo(base_url="http://wiki.junglegym.ai"),
+}
+
+
+def get_site_url(site: WebArenaSite) -> str:
+    """
+    Return the base URL registered for `site` in `site_info_map`.
+
+    Raises:
+        ValueError: if `site` is not a key of `site_info_map`.
+    """
+    site_info = site_info_map.get(site)
+    if site_info is None:
+        raise ValueError(f"JungleGym site '{site}' unknown, cannot resolve URL")
+    return site_info.base_url
+
+
+def resolve_uri(uri: str) -> str:
+    """
+    Replace a mock host placeholder in `uri` with the matching JungleGym site URL.
+
+    The placeholder is the text between the first two `__` markers, e.g. `__REDDIT__`
+    in `__REDDIT__/some/path`; it is matched case-insensitively against the keys of
+    `site_info_map`. URIs without a recognized placeholder are returned unchanged.
+    """
+    parts = uri.split("__")
+    if len(parts) <= 2:
+        # Fewer than two `__` markers: nothing that could be a placeholder
+        return uri
+
+    placeholder = parts[1]
+    site_key = placeholder.lower()
+    if site_key not in site_info_map:
+        return uri
+    return uri.replace(f"__{placeholder}__", get_site_url(site_key))  # type: ignore
+
+
+class Eval(ABC):
+    """Interface of an evaluation criterion: a check and a description of it."""
+
+    @property
+    @abstractmethod
+    def description(self) -> str:
+        """Human-readable description of the criterion."""
+
+    @abstractmethod
+    def evaluate(self, string: str) -> bool:
+        """Return whether `string` satisfies the criterion."""
+
+
+class StringEval(BaseModel, Eval):
+    """Common base of the evals that are distinguished by a `type` tag."""
+
+    # Which kind of string comparison the concrete subclass performs
+    type: ReferenceAnswerType
+
+
+class ExactStringMatchEval(StringEval):
+    """Eval that passes only if the answer equals the reference answer."""
+
+    type: Literal["exact_match"] = "exact_match"
+    reference_answer: str
+
+    def evaluate(self, string: str) -> bool:
+        """Return whether `string` is identical to the reference answer."""
+        expected = self.reference_answer
+        return expected == string
+
+    @property
+    def description(self) -> str:
+        """Describe the required answer."""
+        expected = self.reference_answer
+        return f"Answer must be '{expected}'"
+
+
+class FuzzyStringMatchEval(StringEval):
+    """Eval that passes if the answer loosely matches the reference answer."""
+
+    type: Literal["fuzzy_match"] = "fuzzy_match"
+    reference_answer: str
+
+    def evaluate(self, string: str) -> bool:
+        """
+        Return whether the reference answer occurs in `string`, ignoring case.
+        """
+        # TODO: use LLM for matching (or something else that's flexible/robust)
+        needle = self.reference_answer.lower()
+        haystack = string.lower()
+        return needle in haystack
+
+    @property
+    def description(self) -> str:
+        """Describe what the answer has to resemble."""
+        expected = self.reference_answer
+        return f"Answer must contain something like '{expected}'"
+
+
+class MustIncludeStringEval(StringEval):
+    """Eval that passes if the answer includes the reference answer."""
+
+    type: Literal["must_include"] = "must_include"
+    reference_answer: str
+
+    def evaluate(self, string: str) -> bool:
+        """Return whether the reference answer occurs in `string`, ignoring case."""
+        needle = self.reference_answer.lower()
+        haystack = string.lower()
+        return needle in haystack
+
+    @property
+    def description(self) -> str:
+        """Describe what the answer has to include."""
+        expected = self.reference_answer
+        return f"Answer must include '{expected}'"
+
+
+class UrlMatchEval(BaseModel, Eval):
+    """Eval that passes if a URL equals the resolved reference URL."""
+
+    url: str
+    """Example: `"__WIKI__/wiki/Octopus"`"""
+
+    def evaluate(self, url: str) -> bool:
+        """Return whether `url` equals the reference URL after `resolve_uri`."""
+        expected_url = resolve_uri(self.url)
+        return expected_url == url
+
+    @property
+    def description(self) -> str:
+        """Describe the navigation target, using the unresolved reference URL."""
+        target = self.url
+        return f"Agent must navigate to '{target}'"
+
+
+class ProgramHtmlEval(BaseModel):
+    """Eval that checks a value obtained from a web page by running a script."""
+
+    url: str
+    locator: str
+    """JavaScript code that returns the value to check"""
+    required_contents: str
+
+    def evaluate(self, selenium_instance) -> bool:
+        """
+        Run the locator script through `selenium_instance.execute_script` and return
+        whether its result contains `required_contents`.
+
+        If the locator is empty, a script returning the inner HTML of the document
+        body is run instead.
+        """
+        script = self.locator if self.locator else "return document.body.innerHTML;"
+        page_value = selenium_instance.execute_script(script)
+        return self.required_contents in page_value
+
+    @property
+    def description(self) -> str:
+        """Describe which content is required at which place of which page."""
+        where = f"On the webpage {self.url}, "
+        what = f"`{self.locator}` should contain '{self.required_contents}'"
+        return where + what
+
+
+# Any of the evaluator kinds that `WebArenaChallengeSpec.EvalSet.evaluators` can yield
+_Eval = StringEval | UrlMatchEval | ProgramHtmlEval
+
+
+class WebArenaChallengeSpec(BaseModel):
+    """Specification of one WebArena task, including how to evaluate the outcome."""
+
+    task_id: int
+    sites: list[WebArenaSite]
+    """The sites needed to complete the task"""
+    start_url: str
+    """The full URL at which to start"""
+    start_url_junglegym: str
+    """The JungleGym site (base URL) at which to start"""
+    require_login: bool
+    require_reset: bool
+    storage_state: str | None
+
+    intent: str
+    intent_template: str
+    intent_template_id: int
+    instantiation_dict: dict[str, str | list[str]]
+
+    class EvalSet(BaseModel):
+        """The evaluation criteria of a task, grouped by eval type."""
+
+        class StringMatchEvalSet(BaseModel):
+            """Reference answers for the string_match eval, per kind of match."""
+
+            exact_match: str | None
+            fuzzy_match: list[str] | None
+            must_include: list[str] | None
+
+        reference_answers: StringMatchEvalSet | None
+        """For string_match eval, a set of criteria to judge the final answer"""
+        reference_answer_raw_annotation: str | None
+        string_note: str | None
+        annotation_note: str | None
+
+        reference_url: str | None
+        """For url_match eval, the last URL that should be visited"""
+        url_note: str | None
+
+        program_html: list[ProgramHtmlEval]
+        """For program_html eval, a list of criteria to judge the site state by"""
+
+        # NOTE: declared after the fields checked by `check_eval_parameters`, which
+        # reads them from `values` (presumably filled in field definition order).
+        eval_types: list[EvalType]
+
+        @validator("eval_types")
+        def check_eval_parameters(cls, v: list[EvalType], values):
+            """
+            Check that every eval type listed in `eval_types` has its parameters set.
+
+            Raises:
+                ValueError: if the parameters for a listed eval type are missing or
+                    empty.
+            """
+            if "string_match" in v and not values.get("reference_answers"):
+                raise ValueError("'string_match' eval_type requires reference_answers")
+            if "url_match" in v and not values.get("reference_url"):
+                raise ValueError("'url_match' eval_type requires reference_url")
+            if "program_html" in v and not values.get("program_html"):
+                raise ValueError(
+                    "'program_html' eval_type requires at least one program_html eval"
+                )
+            return v
+
+        @property
+        def evaluators(self) -> list[_Eval]:
+            """
+            Build the evaluators for all criteria that are set.
+
+            The order is fixed: exact match, fuzzy matches, must-include matches,
+            URL match, then the program_html evals. `eval_types` is not consulted.
+            """
+            evaluators: list[_Eval] = []
+            if self.reference_answers:
+                if self.reference_answers.exact_match:
+                    evaluators.append(
+                        ExactStringMatchEval(
+                            reference_answer=self.reference_answers.exact_match
+                        )
+                    )
+                if self.reference_answers.fuzzy_match:
+                    # One evaluator per reference answer
+                    evaluators.extend(
+                        FuzzyStringMatchEval(reference_answer=a)
+                        for a in self.reference_answers.fuzzy_match
+                    )
+                if self.reference_answers.must_include:
+                    # One evaluator per reference answer
+                    evaluators.extend(
+                        MustIncludeStringEval(reference_answer=a)
+                        for a in self.reference_answers.must_include
+                    )
+            if self.reference_url:
+                evaluators.append(UrlMatchEval(url=self.reference_url))
+            evaluators.extend(self.program_html)
+            return evaluators
+
+    eval: EvalSet
+    """Evaluation criteria by which to judge the agent's performance"""
+
+    @property
+    def assignment_for_agent(self) -> str:
+        """
+        The task text for the agent: where to start, the intent (terminated by
+        exactly one period), and the URLs the agent is allowed to access.
+        """
+        sites = [get_site_url(s) for s in self.sites]
+        nav_constraint = (
+            f"You are ONLY allowed to access URLs in {' and '.join(sites)}."
+        )
+
+        return (
+            f"First of all, go to {self.start_url}. "
+            f"{self.intent.rstrip('.')}.\n"
+            f"{nav_constraint}"
+        )
+
+
+class WebArenaChallenge(BaseChallenge):
+    """
+    Challenge based on a JungleGym WebArena task.
+
+    Concrete challenges are subclasses created by `from_challenge_spec`, which sets
+    their `info` and `_spec` class attributes.
+    """
+
+    _spec: ClassVar[WebArenaChallengeSpec]
+
+    SOURCE_URI_PREFIX = "__JUNGLEGYM__/webarena/tasks/"
+    SOURCE_URI_TEMPLATE = f"{SOURCE_URI_PREFIX}{{task_id}}"
+
+    @classmethod
+    def from_source_uri(cls, source_uri: str) -> type["WebArenaChallenge"]:
+        """
+        Fetch the task that `source_uri` refers to and build a challenge class for it.
+
+        Raises:
+            ValueError: if `source_uri` does not start with `SOURCE_URI_PREFIX`, or
+                if the API returns no data for the task.
+        """
+        if not source_uri.startswith(cls.SOURCE_URI_PREFIX):
+            raise ValueError(f"Invalid source_uri for WebArenaChallenge: {source_uri}")
+
+        source_url = source_uri.replace(
+            cls.SOURCE_URI_PREFIX,
+            "https://api.junglegym.ai/get_webarena_by_task_id?task_id=",
+        )
+        # `requests` applies no timeout by default; set one so that an unresponsive
+        # API cannot block the caller indefinitely.
+        results = requests.get(source_url, timeout=30).json()["data"]
+        if not results:
+            raise ValueError(f"Could not fetch challenge {source_uri}")
+        return cls.from_challenge_spec(WebArenaChallengeSpec.parse_obj(results[0]))
+
+    @classmethod
+    def from_challenge_spec(
+        cls, spec: WebArenaChallengeSpec
+    ) -> type["WebArenaChallenge"]:
+        """Create a `WebArenaChallenge` subclass named `TestWebArenaTask_<task_id>`."""
+        challenge_info = ChallengeInfo(
+            eval_id=f"junglegym-webarena-{spec.task_id}",
+            name=f"WebArenaTask_{spec.task_id}",
+            task=spec.assignment_for_agent,
+            category=[
+                Category.GENERALIST,
+                Category.WEB,
+            ],  # TODO: make categories more specific
+            reference_answer=spec.eval.reference_answer_raw_annotation,
+            source_uri=cls.SOURCE_URI_TEMPLATE.format(task_id=spec.task_id),
+        )
+        return type(
+            f"Test{challenge_info.name}",
+            (WebArenaChallenge,),
+            {
+                "info": challenge_info,
+                "_spec": spec,
+            },
+        )
+
+    @classmethod
+    def evaluate_answer(cls, answer: str) -> list[tuple[_Eval, EvalResult]]:
+        """
+        Judge `answer` with every string_match evaluator of the challenge.
+
+        Returns one `(evaluator, result)` pair per `StringEval`, in evaluator order.
+        """
+        results: list[tuple[_Eval, EvalResult]] = []
+        for evaluator in cls._spec.eval.evaluators:
+            if not isinstance(evaluator, StringEval):  # only string_match evals
+                continue
+            # Evaluate once and derive both the score and the verdict from it
+            passed = evaluator.evaluate(answer)
+            results.append(
+                (
+                    evaluator,
+                    EvalResult(
+                        result=answer,
+                        result_source="step_output",
+                        score=1.0 if passed else 0.0,
+                        passed=passed,
+                    ),
+                )
+            )
+        return results
+
+    @classmethod
+    def evaluate_step_result(cls, step: Step) -> list[tuple[_Eval, EvalResult]]:
+        """
+        Judge the output of `step` with the string_match and url_match evaluators.
+
+        The step must have output; callers are expected to filter out steps
+        without it. Results of the string_match evaluators come first, followed by
+        those of the url_match evaluators.
+        """
+        assert step.output
+        eval_results = cls.evaluate_answer(step.output)
+        for evaluator in cls._spec.eval.evaluators:
+            if isinstance(evaluator, UrlMatchEval):
+                # HACK: url_match bodge
+                passed = resolve_uri(evaluator.url) in step.output
+                eval_results.append(
+                    (
+                        evaluator,
+                        EvalResult(
+                            result=step.output,
+                            result_source="step_output",
+                            score=1.0 if passed else 0.0,
+                            passed=passed,
+                        ),
+                    )
+                )
+            # TODO: add support for program_html evals
+        return eval_results
+
+    @classmethod
+    async def evaluate_task_state(
+        cls, agent: AgentApi, task_id: str
+    ) -> list[EvalResult]:
+        """
+        Evaluate all steps of the task `task_id` and return, for each evaluator,
+        the highest scored result over all steps.
+        """
+        steps: list[Step] = (await agent.list_agent_task_steps(task_id)).steps
+
+        # Steps without output cannot be evaluated (`evaluate_step_result` asserts
+        # on them), so skip them just like `test_method` does.
+        eval_results_per_step = [
+            cls.evaluate_step_result(step) for step in steps if step.output
+        ]
+        # Get the column aggregate (highest scored EvalResult for each Eval)
+        # from the matrix of EvalResults per step.
+        return [
+            max(step_results_for_eval, key=lambda r: r[1].score)[1]
+            for step_results_for_eval in zip(*eval_results_per_step)
+        ]
+
+    @pytest.mark.asyncio
+    async def test_method(
+        self, config: AgentBenchmarkConfig, request: pytest.FixtureRequest
+    ) -> None:
+        """
+        Run the challenge and assert that, for every evaluator, at least one step
+        scored above the threshold.
+
+        The timeout defaults to 120 and is overridden by the `--nc` and `--cutoff`
+        options. Answers (only with `--keep-answers`), the timeout status and the
+        scores are recorded in the user properties of the test node.
+        """
+        if os.environ.get("HELICONE_API_KEY"):
+            from helicone.lock import HeliconeLockManager
+
+            HeliconeLockManager.write_custom_property("challenge", self.info.name)
+
+        timeout = 120
+        if request.config.getoption("--nc"):
+            timeout = 100000
+        elif cutoff := request.config.getoption("--cutoff"):
+            timeout = int(cutoff)
+
+        timed_out = None
+        eval_results_per_step: list[list[tuple[_Eval, EvalResult]]] = []
+        try:
+            async for step in self.run_challenge(config, timeout):
+                if not step.output:
+                    logger.warning(f"Step has no output: {step}")
+                    continue
+                step_eval_results = self.evaluate_step_result(step)
+                logger.debug(f"Intermediary results: {step_eval_results}")
+                eval_results_per_step.append(step_eval_results)
+                if step.is_last:
+                    request.node.user_properties.append(
+                        (
+                            "answers",
+                            step.output
+                            if request.config.getoption("--keep-answers")
+                            else None,
+                        )
+                    )
+            timed_out = False
+        except TimeoutError:
+            timed_out = True
+        request.node.user_properties.append(("timed_out", timed_out))
+
+        # Get the column aggregate (highest score for each Eval)
+        # from the matrix of EvalResults per step.
+        evals_results = [
+            max(step_results_for_eval, key=lambda r: r[1].score)
+            for step_results_for_eval in zip(*eval_results_per_step)
+        ]
+
+        if not evals_results:
+            if timed_out:
+                raise TimeoutError("Timed out, no results to evaluate")
+            else:
+                raise ValueError("No results to evaluate")
+
+        request.node.user_properties.append(
+            ("scores", [r[1].score for r in evals_results])
+        )
+
+        # FIXME: arbitrary threshold
+        assert all(r[1].score > 0.9 for r in evals_results), (
+            "Scores insufficient:\n\n"
+            if not timed_out
+            else "Timed out; scores insufficient:\n\n"
+        ) + "\n".join(f"{repr(r[0])}\n  -> {repr(r[1])}" for r in evals_results)
+
+
+def load_webarena_challenges() -> Iterator[type[WebArenaChallenge]]:
+    """
+    Yield a `WebArenaChallenge` subclass for each usable entry of the hand-picked
+    selection in `webarena_selection.json`, located next to this module.
+
+    Entries that fail validation are logged and counted as failed. Entries that need
+    an unknown or unavailable site are logged and counted as skipped.
+    """
+    import json
+    from pathlib import Path
+
+    logger.info("Loading WebArena challenges...")
+
+    # Announce up front which sites will cause challenges to be skipped
+    for site_name, site in site_info_map.items():
+        if site.available:
+            continue
+        logger.warning(
+            f"JungleGym site '{site_name}' is not available: {site.unavailable_reason} "
+            "Skipping all challenges which use this site."
+        )
+
+    # Until the full WebArena challenge set is supported, use a hand-picked selection
+    # instead of fetching https://api.junglegym.ai/get_full_webarena_dataset
+    selection_file = Path(__file__).with_name("webarena_selection.json")
+    challenge_dicts = json.loads(selection_file.read_bytes())
+
+    logger.debug(
+        "Fetched WebArena dataset. "
+        f"Constructing {len(challenge_dicts)} WebArenaChallenges..."
+    )
+    loaded = failed = skipped = 0
+    for entry in challenge_dicts:
+        try:
+            challenge_spec = WebArenaChallengeSpec.parse_obj(entry)
+
+            # The challenge is usable only if all of its sites are known and available
+            usable = True
+            for site_name in challenge_spec.sites:
+                site = site_info_map.get(site_name)
+                if site is None:
+                    logger.warning(
+                        f"WebArena task {challenge_spec.task_id} requires unknown site "
+                        f"'{site_name}'; skipping..."
+                    )
+                elif not site.available:
+                    logger.debug(
+                        f"WebArena task {challenge_spec.task_id} requires unavailable "
+                        f"site '{site_name}'; skipping..."
+                    )
+                else:
+                    continue
+                usable = False
+                break
+
+            if usable:
+                yield WebArenaChallenge.from_challenge_spec(challenge_spec)
+                loaded += 1
+            else:
+                skipped += 1
+        except ValidationError as e:
+            failed += 1
+            logger.warning(f"Error validating WebArena challenge entry: {entry}")
+            logger.warning(f"Error details: {e}")
+    logger.info(
+        "Loading WebArena challenges complete: "
+        f"loaded {loaded}, skipped {skipped}. {failed} challenge failed to load."
+    )
diff --git a/benchmark/agbenchmark/challenges/webarena_selection.json b/benchmark/agbenchmark/challenges/webarena_selection.json
new file mode 100644
index 000000000..e35a27d37
--- /dev/null
+++ b/benchmark/agbenchmark/challenges/webarena_selection.json
@@ -0,0 +1,523 @@
+[
+    {
+        "sites": [
+            "shopping_admin"
+        ],
+        "task_id": 0,
+        "require_login": true,
+        "storage_state": "./.auth/shopping_admin_state.json",
+        "start_url": "http://cms.junglegym.ai/admin",
+        "geolocation": "NaN",
+        "intent_template": "What is the top-{{n}} best-selling product in {{year}}",
+        "instantiation_dict": {
+            "n": 1,
+            "year": 2022
+        },
+        "intent": "What is the top-1 best-selling product in 2022",
+        "require_reset": false,
+        "eval": {
+            "eval_types": [
+                "string_match"
+            ],
+            "reference_answers": {
+                "exact_match": "Quest Lumaflex™ Band"
+            },
+            "reference_url": "",
+            "program_html": [],
+            "string_note": "",
+            "reference_answer_raw_annotation": "Quest Lumaflex™ Band"
+        },
+        "intent_template_id": 279,
+        "string_note": null,
+        "start_url_junglegym": "http://cms.junglegym.ai/admin"
+    },
+    {
+        "sites": [
+            "shopping_admin"
+        ],
+        "task_id": 4,
+        "require_login": true,
+        "storage_state": "./.auth/shopping_admin_state.json",
+        "start_url": "http://cms.junglegym.ai/admin",
+        "geolocation": "NaN",
+        "intent_template": "What are the top-{{n}} best-selling product in {{period}}",
+        "instantiation_dict": {
+            "n": 3,
+            "period": "Jan 2023"
+        },
+        "intent": "What are the top-3 best-selling product in Jan 2023",
+        "require_reset": false,
+        "eval": {
+            "eval_types": [
+                "string_match"
+            ],
+            "reference_answers": {
+                "must_include": [
+                    "Impulse Duffle",
+                    "Overnight Duffle",
+                    "Hawkeye Yoga Short-32-Blue"
+                ]
+            },
+            "reference_url": "",
+            "program_html": [],
+            "string_note": "",
+            "reference_answer_raw_annotation": "Impulse Duffle, Overnight Duffle, Hawkeye Yoga Short-32-Blue"
+        },
+        "intent_template_id": 279,
+        "string_note": null,
+        "start_url_junglegym": "http://cms.junglegym.ai/admin"
+    },
+    {
+        "sites": [
+            "shopping_admin"
+        ],
+        "task_id": 6,
+        "require_login": true,
+        "storage_state": "./.auth/shopping_admin_state.json",
+        "start_url": "http://cms.junglegym.ai/admin",
+        "geolocation": "NaN",
+        "intent_template": "What are the top-{{n}} best-selling product in {{year}}",
+        "instantiation_dict": {
+            "n": 5,
+            "year": 2023
+        },
+        "intent": "What are the top-5 best-selling product in 2023",
+        "require_reset": false,
+        "eval": {
+            "eval_types": [
+                "string_match"
+            ],
+            "reference_answers": {
+                "must_include": [
+                    "Sprite Yoga Strap 6 foot",
+                    "Overnight Duffle",
+                    "Ida Workout Parachute Pant-29-Purple",
+                    "Hawkeye Yoga Short-32-Blue",
+                    "Sprite Stasis Ball 65 cm"
+                ]
+            },
+            "reference_url": "",
+            "program_html": [],
+            "string_note": "",
+            "reference_answer_raw_annotation": "Sprite Yoga Strap 6 foot, Overnight Duffle, Ida Workout Parachute Pant-29-Purple, Hawkeye Yoga Short-32-Blue, Sprite Stasis Ball 65 cm"
+        },
+        "intent_template_id": 279,
+        "string_note": null,
+        "start_url_junglegym": "http://cms.junglegym.ai/admin"
+    },
+    {
+        "sites": [
+            "shopping_admin"
+        ],
+        "task_id": 11,
+        "require_login": true,
+        "storage_state": "./.auth/shopping_admin_state.json",
+        "start_url": "http://cms.junglegym.ai/admin",
+        "geolocation": "NaN",
+        "intent_template": "Tell me the the number of reviews that our store received by far that mention term \"{{term}}\"",
+        "instantiation_dict": {
+            "term": "disappointed"
+        },
+        "intent": "Tell me the the number of reviews that our store received by far that mention term \"disappointed\"",
+        "require_reset": false,
+        "eval": {
+            "eval_types": [
+                "string_match"
+            ],
+            "reference_answers": {
+                "must_include": [
+                    "6"
+                ]
+            },
+            "reference_url": "",
+            "program_html": [],
+            "string_note": "",
+            "reference_answer_raw_annotation": "6"
+        },
+        "intent_template_id": 288,
+        "string_note": null,
+        "start_url_junglegym": "http://cms.junglegym.ai/admin"
+    },
+    {
+        "sites": [
+            "shopping"
+        ],
+        "task_id": 21,
+        "require_login": true,
+        "storage_state": "./.auth/shopping_state.json",
+        "start_url": "http://shop.junglegym.ai/6s-wireless-headphones-over-ear-noise-canceling-hi-fi-bass-foldable-stereo-wireless-kid-headsets-earbuds-with-built-in-mic-micro-sd-tf-fm-for-iphone-samsung-ipad-pc-black-gold.html",
+        "geolocation": "NaN",
+        "intent_template": "List out reviewers, if exist, who mention about {{description}}",
+        "instantiation_dict": {
+            "description": "ear cups being small"
+        },
+        "intent": "List out reviewers, if exist, who mention about ear cups being small",
+        "require_reset": false,
+        "eval": {
+            "eval_types": [
+                "string_match"
+            ],
+            "reference_answers": {
+                "must_include": [
+                    "Joseph Brzezinski",
+                    "Catso",
+                    "Dibbins",
+                    "Anglebert Dinkherhump",
+                    "Michelle Davis"
+                ]
+            },
+            "reference_url": "",
+            "program_html": [],
+            "string_note": "",
+            "reference_answer_raw_annotation": "Joseph Brzezinski, Catso, Dibbins, Anglebert Dinkherhump, Michelle Davis"
+        },
+        "intent_template_id": 222,
+        "string_note": null,
+        "start_url_junglegym": "http://shop.junglegym.ai"
+    },
+    {
+        "sites": [
+            "reddit"
+        ],
+        "task_id": 27,
+        "require_login": true,
+        "storage_state": "./.auth/reddit_state.json",
+        "start_url": "http://forum.junglegym.ai",
+        "geolocation": "NaN",
+        "intent_template": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the {{forum}} forum.",
+        "instantiation_dict": {
+            "forum": "Showerthoughts"
+        },
+        "intent": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the Showerthoughts forum.",
+        "require_reset": false,
+        "eval": {
+            "eval_types": [
+                "string_match"
+            ],
+            "reference_answers": {
+                "must_include": [
+                    "0"
+                ]
+            },
+            "reference_url": "",
+            "program_html": [],
+            "string_note": "",
+            "reference_answer_raw_annotation": "0"
+        },
+        "intent_template_id": 33,
+        "string_note": null,
+        "start_url_junglegym": "http://forum.junglegym.ai"
+    },
+    {
+        "sites": [
+            "shopping_admin"
+        ],
+        "task_id": 43,
+        "require_login": true,
+        "storage_state": "./.auth/shopping_admin_state.json",
+        "start_url": "http://cms.junglegym.ai/admin",
+        "geolocation": "NaN",
+        "intent_template": "List the top {{n}} search terms in my store",
+        "instantiation_dict": {
+            "n": "3"
+        },
+        "intent": "List the top 3 search terms in my store",
+        "require_reset": false,
+        "eval": {
+            "eval_types": [
+                "string_match"
+            ],
+            "reference_answers": {
+                "must_include": [
+                    "hollister",
+                    "Joust Bag",
+                    "Antonia Race Tank"
+                ]
+            },
+            "reference_url": "",
+            "program_html": [],
+            "string_note": "",
+            "reference_answer_raw_annotation": "hollister, Joust Bag, Antonia Race Tank"
+        },
+        "intent_template_id": 285,
+        "string_note": null,
+        "start_url_junglegym": "http://cms.junglegym.ai/admin"
+    },
+    {
+        "sites": [
+            "shopping_admin"
+        ],
+        "task_id": 77,
+        "require_login": true,
+        "storage_state": "./.auth/shopping_admin_state.json",
+        "start_url": "http://cms.junglegym.ai/admin",
+        "geolocation": "NaN",
+        "intent_template": "What is the total count of {{status}} reviews amongst all the reviews?",
+        "instantiation_dict": {
+            "status": "Pending"
+        },
+        "intent": "What is the total count of Pending reviews amongst all the reviews?",
+        "require_reset": false,
+        "eval": {
+            "eval_types": [
+                "string_match"
+            ],
+            "reference_answers": {
+                "must_include": [
+                    "5"
+                ]
+            },
+            "reference_url": "",
+            "program_html": [],
+            "string_note": "",
+            "reference_answer_raw_annotation": "5"
+        },
+        "intent_template_id": 277,
+        "string_note": null,
+        "start_url_junglegym": "http://cms.junglegym.ai/admin"
+    },
+    {
+        "sites": [
+            "shopping_admin"
+        ],
+        "task_id": 95,
+        "require_login": true,
+        "storage_state": "./.auth/shopping_admin_state.json",
+        "start_url": "http://cms.junglegym.ai/admin",
+        "geolocation": "NaN",
+        "intent_template": "Telll me the grand total of invoice {{id}}.",
+        "instantiation_dict": {
+            "id": "000000002"
+        },
+        "intent": "Telll me the grand total of invoice 000000002.",
+        "require_reset": false,
+        "eval": {
+            "eval_types": [
+                "string_match"
+            ],
+            "reference_answers": {
+                "must_include": [
+                    "39.64"
+                ]
+            },
+            "reference_url": "",
+            "program_html": [],
+            "string_note": "",
+            "reference_answer_raw_annotation": "$39.64"
+        },
+        "intent_template_id": 274,
+        "string_note": null,
+        "start_url_junglegym": "http://cms.junglegym.ai/admin"
+    },
+    {
+        "sites": [
+            "shopping_admin"
+        ],
+        "task_id": 107,
+        "require_login": true,
+        "storage_state": "./.auth/shopping_admin_state.json",
+        "start_url": "http://cms.junglegym.ai/admin",
+        "geolocation": "NaN",
+        "intent_template": "Presents the monthly count of successful orders {{period}} in MM:COUNT format",
+        "instantiation_dict": {
+            "period": "from May to December 2022"
+        },
+        "intent": "Presents the monthly count of successful orders from May to December 2022 in MM:COUNT format",
+        "require_reset": false,
+        "eval": {
+            "eval_types": [
+                "string_match"
+            ],
+            "reference_answers": {
+                "fuzzy_match": [
+                    "May: 8 orders",
+                    "June: 13 orders",
+                    "July: 9 orders",
+                    "August: 8 orders",
+                    "Sepetember: 10 orders",
+                    "October: 4 orders",
+                    "November: 5 orders",
+                    "December: 10 orders"
+                ]
+            },
+            "reference_url": "",
+            "program_html": [],
+            "string_note": "",
+            "reference_answer_raw_annotation": "May: 8 orders June: 13 orders July: 9 orders August: 8 orders Sepetember: 10 orders Octorbor: 4 orders November: 5 orders December: 10 orders  "
+        },
+        "intent_template_id": 270,
+        "string_note": null,
+        "start_url_junglegym": "http://cms.junglegym.ai/admin"
+    },
+    {
+        "sites": [
+            "shopping_admin"
+        ],
+        "task_id": 112,
+        "require_login": true,
+        "storage_state": "./.auth/shopping_admin_state.json",
+        "start_url": "http://cms.junglegym.ai/admin",
+        "geolocation": "NaN",
+        "intent_template": "Show me the customers who have expressed dissatisfaction with {{product}}?",
+        "instantiation_dict": {
+            "product": "Circe fleece"
+        },
+        "intent": "Show me the customers who have expressed dissatisfaction with Circe fleece?",
+        "require_reset": false,
+        "eval": {
+            "eval_types": [
+                "string_match"
+            ],
+            "reference_answers": {
+                "exact_match": "Hannah Lim"
+            },
+            "reference_url": "",
+            "program_html": [],
+            "string_note": "",
+            "reference_answer_raw_annotation": "Hannah Lim"
+        },
+        "intent_template_id": 245,
+        "string_note": null,
+        "start_url_junglegym": "http://cms.junglegym.ai/admin"
+    },
+    {
+        "sites": [
+            "shopping"
+        ],
+        "task_id": 124,
+        "require_login": true,
+        "storage_state": "./.auth/shopping_state.json",
+        "start_url": "http://shop.junglegym.ai",
+        "geolocation": "NaN",
+        "intent_template": "What is the price range of {{product}} in the One Stop Market?",
+        "instantiation_dict": {
+            "product": "wireless earphone"
+        },
+        "intent": "What is the price range of wireless earphone in the One Stop Market?",
+        "require_reset": false,
+        "eval": {
+            "eval_types": [
+                "string_match"
+            ],
+            "reference_answers": {
+                "must_include": [
+                    "0.14",
+                    "745.00"
+                ]
+            },
+            "reference_url": "",
+            "program_html": [],
+            "string_note": "",
+            "reference_answer_raw_annotation": "$0.14 - $745.00"
+        },
+        "intent_template_id": 159,
+        "string_note": null,
+        "start_url_junglegym": "http://shop.junglegym.ai"
+    },
+    {
+        "sites": [
+            "gitlab"
+        ],
+        "task_id": 134,
+        "require_login": true,
+        "storage_state": "./.auth/gitlab_state.json",
+        "start_url": "http://git.junglegym.ai",
+        "geolocation": "NaN",
+        "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?",
+        "instantiation_dict": {
+            "user": "kilian",
+            "repo": "a11yproject",
+            "date": "3/1/2023"
+        },
+        "intent": "How many commits did kilian make to a11yproject on 3/1/2023?",
+        "require_reset": false,
+        "eval": {
+            "eval_types": [
+                "string_match"
+            ],
+            "reference_answers": {
+                "must_include": [
+                    "0"
+                ]
+            },
+            "reference_url": "",
+            "program_html": [],
+            "string_note": "",
+            "reference_answer_raw_annotation": "0"
+        },
+        "intent_template_id": 322,
+        "string_note": null,
+        "start_url_junglegym": "http://git.junglegym.ai"
+    },
+    {
+        "sites": [
+            "gitlab"
+        ],
+        "task_id": 136,
+        "require_login": true,
+        "storage_state": "./.auth/gitlab_state.json",
+        "start_url": "http://git.junglegym.ai",
+        "geolocation": "NaN",
+        "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?",
+        "instantiation_dict": {
+            "user": "Steven Woodson",
+            "repo": "a11y-webring.club",
+            "date": "2/6/2023"
+        },
+        "intent": "How many commits did Steven Woodson make to a11y-webring.club on 2/6/2023?",
+        "require_reset": false,
+        "eval": {
+            "eval_types": [
+                "string_match"
+            ],
+            "reference_answers": {
+                "must_include": [
+                    "5"
+                ]
+            },
+            "reference_url": "",
+            "program_html": [],
+            "string_note": "",
+            "reference_answer_raw_annotation": "5"
+        },
+        "intent_template_id": 322,
+        "string_note": null,
+        "start_url_junglegym": "http://git.junglegym.ai"
+    },
+    {
+        "sites": [
+            "shopping"
+        ],
+        "task_id": 163,
+        "require_login": true,
+        "storage_state": "./.auth/shopping_state.json",
+        "start_url": "http://shop.junglegym.ai/ostent-16gb-memory-card-stick-storage-for-sony-ps-vita-psv1000-2000-pch-z081-z161-z321-z641.html",
+        "geolocation": "NaN",
+        "intent_template": "What are the main criticisms of this product? Please extract the relevant sentences.",
+        "instantiation_dict": {},
+        "intent": "What are the main criticisms of this product? Please extract the relevant sentences.",
+        "require_reset": false,
+        "eval": {
+            "eval_types": [
+                "string_match"
+            ],
+            "reference_answers": {
+                "must_include": [
+                    "I ordered the 16gb but I only got 14 gigs even though I formatted the card",
+                    "The memory card is kind of slow on games and downloads",
+                    "No original packaging It's used and the previous owners data has not been erased",
+                    "The product is a legit sony hardware that have been owned by someone else before",
+                    "The media could not be loaded",
+                    "I could not format the card so I wasn’t able to use it for my VITA"
+                ]
+            },
+            "reference_url": "",
+            "program_html": [],
+            "string_note": "",
+            "reference_answer_raw_annotation": "I ordered the 16gb but I only got 14 gigs even though I formatted the card. The memory card is kind of slow on games and downloads. No original packaging It's used and the previous owners data has not been erased. The product is a legit sony hardware that have been owned by someone else before The media could not be loaded.  I could not format the card so I wasn’t able to use it for my VITA"
+        },
+        "intent_template_id": 136,
+        "string_note": null,
+        "start_url_junglegym": "http://shop.junglegym.ai"
+    }
+]
diff --git a/benchmark/agbenchmark/generate_test.py b/benchmark/agbenchmark/generate_test.py
index 20a352a50..5bc41971e 100644
--- a/benchmark/agbenchmark/generate_test.py
+++ b/benchmark/agbenchmark/generate_test.py
@@ -7,15 +7,17 @@ classes in the module that conform to the `Test*` pattern are collected.
 
 import importlib
 import logging
+from itertools import chain
 
 from agbenchmark.challenges.builtin import load_builtin_challenges
+from agbenchmark.challenges.webarena import load_webarena_challenges
 
 logger = logging.getLogger(__name__)
 
 DATA_CATEGORY = {}
 
 # Load challenges and attach them to this module
-for challenge in load_builtin_challenges():
+for challenge in chain(load_builtin_challenges(), load_webarena_challenges()):
     # Attach the Challenge class to this module so it can be discovered by pytest
     module = importlib.import_module(__name__)
     setattr(module, challenge.__name__, challenge)
author	Reinier van der Leer <pwuts@agpt.co>	2024-01-19 20:34:04 +0100
committer	GitHub <noreply@github.com>	2024-01-19 20:34:04 +0100
commit	488f40a20fbd4c5929611ed97ff6782c938b481d (patch)
tree	1db933c2c0b544e72609b95f0f7b1ef6f8c3ea9d
parent	fix(benchmark/report): Fix and clean up logic in `update_challenges_already_b... (diff)
download	Auto-GPT-488f40a20fbd4c5929611ed97ff6782c938b481d.tar.gz Auto-GPT-488f40a20fbd4c5929611ed97ff6782c938b481d.tar.bz2 Auto-GPT-488f40a20fbd4c5929611ed97ff6782c938b481d.zip