update benchmark

WHALEEYE · WHALEEYE · commit bfd778d4734c · 2024-10-30T20:51:51.000-05:00
diff --git a/gui/gui_experiment.py b/gui/gui_experiment.py
@@ -0,0 +1,49 @@
+# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. ===========
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =========== Copyright 2024 @ CAMEL-AI.org. All Rights Reserved. ===========
+from pathlib import Path
+from typing import Literal
+
+from crab import AgentPolicy, Benchmark, Experiment, MessageType
+
+
+class GuiExperiment(Experiment):
+    """Experiment that prompts with plain and element-labeled screenshots."""
+
+    def __init__(
+        self,
+        benchmark: Benchmark,
+        task_id: str,
+        agent_policy: AgentPolicy | Literal["human"],
+        log_dir: Path | None = None,
+    ) -> None:
+        # Pure pass-through: this subclass adds no constructor state of its own.
+        super().__init__(benchmark, task_id, agent_policy, log_dir)
+
+    def get_prompt(self):
+        """Return ``{env: messages}`` for every prompt environment except "root"."""
+        raw_obs, prompt_obs = self.benchmark.observe_with_prompt()
+        messages = {}
+        for env_name in (key for key in prompt_obs if key != "root"):
+            plain_shot = raw_obs[env_name]["screenshot"]
+            marked_shot, _ = prompt_obs[env_name]["screenshot"]  # 2nd item unused
+            plain_text = f"Here is the current screenshot of {env_name}:"
+            marked_text = f"Here is the screenshot with element labels of {env_name}:"
+            # Each image is preceded by a text message saying what it shows.
+            messages[env_name] = [
+                (plain_text, MessageType.TEXT),
+                (plain_shot, MessageType.IMAGE_JPG_BASE64),
+                (marked_text, MessageType.TEXT),
+                (marked_shot, MessageType.IMAGE_JPG_BASE64),
+            ]
+        return messages
diff --git a/gui/main.py b/gui/main.py
@@ -17,9 +17,9 @@
 
 import customtkinter as ctk
 
-from crab import Experiment
 from crab.agents.backend_models import ClaudeModel, GeminiModel, OpenAIModel
 from crab.agents.policies import SingleAgentPolicy
+from gui.gui_experiment import GuiExperiment
 from gui.utils import get_benchmark
 
 warnings.filterwarnings("ignore")
@@ -58,7 +58,7 @@ def assign_task():
 
     task_id = str(uuid4())
     benchmark = get_benchmark(task_id, task_description)
-    experiment = Experiment(
+    experiment = GuiExperiment(
         benchmark=benchmark,
         task_id=task_id,
         agent_policy=agent_policy,
@@ -82,8 +82,6 @@ def display_message(message, sender="user"):
 
 
 if __name__ == "__main__":
-    # TODO: Handle JSON decode error from environment action endpoint and
-    #  display model response in GUI
     log_dir = (Path(__file__).parent / "logs").resolve()
 
     ctk.set_appearance_mode("System")