add mobile agent v2 #457

Merged
15 commits merged on Jun 3, 2024
26 changes: 26 additions & 0 deletions apps/mobile_agent/README.md
@@ -0,0 +1,26 @@
# Mobile-Agent-v2: Mobile Device Operation Assistant with Effective Navigation via Multi-Agent Collaboration

## 🔧Getting Started

### Installation
```
pip install -r requirements.txt
```

### Preparation for Connecting Mobile Device
1. Download the [Android Debug Bridge](https://developer.android.com/tools/releases/platform-tools?hl=en).
2. Enable ADB debugging on your Android phone; it must first be enabled in the developer options.
3. Connect your phone to the computer with a data cable and select "Transfer files".
4. Test your ADB environment as follows: ```/path/to/adb devices```. If the connected device is listed (see the sample output after this list), the preparation is complete.
5. If you are using macOS or Linux, make sure adb has execute permission, as follows: ```sudo chmod +x /path/to/adb```
6. If you are using Windows, the adb path will look like ```xx/xx/adb.exe```.
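
If the connection works, ```/path/to/adb devices``` typically prints something like the following (the device serial will differ):

```
List of devices attached
emulator-5554   device
```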



### Run

The arguments for running the demo are listed below; a sample invocation follows the list:
* `--adb_path`: The path to your adb executable.
* `--openai_api_key`: The OpenAI API key used to call the LLM.
* `--dashscope_api_key`: The DashScope API key used to call qwen-vl.
* `--instruction`: The instruction for the agent to execute.
17 changes: 17 additions & 0 deletions apps/mobile_agent/requirements.txt
@@ -0,0 +1,17 @@
git+https://github.com/openai/CLIP.git
keras==2.9.0
matplotlib
modelscope
opencv-python
pyclipper
pycocotools
SentencePiece
shapely
supervision
TensorFlow==2.9.1
tf_keras
tf_slim
timm
torch
torchvision
transformers
39 changes: 39 additions & 0 deletions apps/mobile_agent/run.py
@@ -0,0 +1,39 @@
import argparse
import os

from modelscope_agent.agents.mobile_agent_v2 import MobileAgentV2
from modelscope_agent.environment import ADBEnvironment

parser = argparse.ArgumentParser()
parser.add_argument('--adb_path', type=str, default='./adb/adb')
parser.add_argument(
'--openai_api_key', type=str, default=os.getenv('OPENAI_API_KEY'))
parser.add_argument(
'--dashscope_api_key', type=str, default=os.getenv('DASHSCOPE_API_KEY'))
parser.add_argument(
    '--instruction', type=str, default="Tell me today's weather")

args = parser.parse_args()

adb_path = args.adb_path

os.environ['OPENAI_API_KEY'] = args.openai_api_key
# used to call qwen-vl to describe icons during perception
os.environ['DASHSCOPE_API_KEY'] = args.dashscope_api_key

instruction = args.instruction

llm_config = {
'model': 'gpt-4o',
'model_server': 'openai',
}

env = ADBEnvironment(adb_path)

agent = MobileAgentV2(
env=env,
llm_decision=llm_config,
llm_planner=llm_config,
llm_reflect=llm_config)

agent.run(instruction)
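
Because `MobileAgentV2` accepts a separate config for the decision, planner, and reflect roles, the three roles need not share one model. A minimal sketch of such a variation of run.py (`gpt-4o-mini` is an assumed model name; substitute any model your `model_server` supports):

```python
# Hypothetical variation of run.py: use a lighter model for reflection.
llm_config_main = {'model': 'gpt-4o', 'model_server': 'openai'}
# 'gpt-4o-mini' is an assumption; any model served by 'openai' works here.
llm_config_light = {'model': 'gpt-4o-mini', 'model_server': 'openai'}

agent = MobileAgentV2(
    env=env,
    llm_decision=llm_config_main,
    llm_planner=llm_config_main,
    llm_reflect=llm_config_light)
agent.run(instruction)
```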
6 changes: 0 additions & 6 deletions modelscope_agent/agent.py
@@ -5,7 +5,6 @@
from modelscope_agent.llm.base import BaseChatModel
from modelscope_agent.tools.base import (TOOL_REGISTRY, BaseTool,
ToolServiceProxy)
- from modelscope_agent.utils.logger import agent_logger as logger
from modelscope_agent.utils.utils import has_chinese_chars


@@ -193,11 +192,6 @@ def _parse_image_url(self, image_url: List[Union[str, Dict]],
messages: List[Dict]) -> List[Dict]:

assert len(messages) > 0
-        if self.llm.model not in ['gpt-4o', 'gpt-4-turbo']:
-            logger.warning(
-                f'currently only gp4-4o and gpt-4-turbo support image_url, but the model is {self.llm.model}'
-            )
-            return messages

if isinstance(image_url[0], str):
image_url = [{'url': url} for url in image_url]
1 change: 1 addition & 0 deletions modelscope_agent/agents/__init__.py
@@ -1,4 +1,5 @@
from .agent_builder import AgentBuilder
from .gen_keyword import GenKeyword
from .mobile_agent_v2 import MobileAgentV2
from .multi_role_play import MultiRolePlay
from .role_play import RolePlay
1 change: 1 addition & 0 deletions modelscope_agent/agents/mobile_agent_v2/__init__.py
@@ -0,0 +1 @@
from .mobile_agent_v2 import MobileAgentV2
204 changes: 204 additions & 0 deletions modelscope_agent/agents/mobile_agent_v2/mobile_agent_v2.py
@@ -0,0 +1,204 @@
import copy
import os
from typing import Dict, List, Optional, Tuple, Union

import json
from modelscope_agent import Agent
from modelscope_agent.environment import ADBEnvironment
from modelscope_agent.llm import get_chat_model
from modelscope_agent.llm.base import BaseChatModel
from modelscope_agent.utils.logger import agent_logger as logger

from .prompt import (get_action_prompt, get_memory_prompt, get_process_prompt,
get_reflect_prompt, get_system_prompt)


class MobileAgentV2(Agent):

def __init__(self,
env: ADBEnvironment,
function_list: Optional[List[Union[str, Dict]]] = None,
llm_planner: Optional[Union[Dict, BaseChatModel]] = None,
llm_decision: Optional[Union[Dict, BaseChatModel]] = None,
llm_reflect: Optional[Union[Dict, BaseChatModel]] = None,
storage_path: Optional[str] = None,
**kwargs):

self.env = env

if isinstance(llm_planner, Dict):
self.llm_config_planner = llm_planner
self.llm_planner = get_chat_model(**self.llm_config_planner)
else:
self.llm_planner = llm_planner

if isinstance(llm_decision, Dict):
self.llm_config_decision = llm_decision
self.llm_decision = get_chat_model(**self.llm_config_decision)
else:
self.llm_decision = llm_decision

if isinstance(llm_reflect, Dict):
self.llm_config_reflect = llm_reflect
self.llm_reflect = get_chat_model(**self.llm_config_reflect)
else:
self.llm_reflect = llm_reflect
self.stream = True

self.function_list = []
self.function_map = {}
if function_list:
for function in function_list:
self._register_tool(function)

self.storage_path = storage_path
self.mem = None

def _run(self, user_query, **kwargs):
step = 0

thought_history = []
summary_history = []
action_history = []
add_info = "If you want to tap an icon of an app, use the action \"Open app\""
summary = ''
action = ''
completed_requirements = ''
memory = ''

error_flag = False

logger.info('Start running mobile agent')
while True:
step += 1
this_results = {}
            logger.info(f'Observe the environment: Step {step}')
perception_infos, width, height, keyboard, screenshot_file = self.env.observe(
)

# decision
prompt_action = get_action_prompt(user_query, perception_infos,
width, height, keyboard,
summary_history, action_history,
summary, action, add_info,
error_flag,
completed_requirements, memory)
system_prompt_decision = get_system_prompt('decision')
messages = [{'role': 'system', 'content': system_prompt_decision}]
messages.append({'role': 'user', 'content': prompt_action})

self._parse_image_url([screenshot_file], messages)

logger.info(f'Call decision agent: Step {step}')
output_decision = self.llm_decision.chat(messages=messages)
            # this_results['decision'] = output_decision

            thought = output_decision.split(
                '### Thought ###')[-1].split('### Action ###')[0].replace(
                    '\n', ' ').replace(':', '').replace('  ', ' ').strip()
            summary = output_decision.split('### Operation ###')[-1].replace(
                '\n', ' ').replace('  ', ' ').strip()
            action = output_decision.split('### Action ###')[-1].split(
                '### Operation ###')[0].replace('\n', ' ').replace(
                    '  ', ' ').strip()

messages.append({'role': 'assistant', 'content': output_decision})

prompt_memory = get_memory_prompt()

messages.append({'role': 'user', 'content': prompt_memory})

            logger.info(f'Call decision agent for memory extraction: Step {step}')
output_memory = self.llm_decision.chat(messages=messages)

messages.append({'role': 'assistant', 'content': output_memory})

output_memory = output_memory.split('### Important content ###')[
-1].split('\n\n')[0].strip() + '\n'
if 'None' not in output_memory and output_memory not in memory:
memory += output_memory
this_results['memory'] = output_memory

print_status(messages)

            # execute the chosen action; env.act returns truthy when the
            # task is complete, which ends the loop
            if self.env.act(action):
                break

last_perception_infos = copy.deepcopy(perception_infos)
last_keyboard = keyboard
last_screenshot_file = screenshot_file

logger.info(f'Observe the environment before reflect: Step {step}')
perception_infos, width, height, keyboard, screenshot_file = self.env.observe(
)

# reflect
prompt_reflect = get_reflect_prompt(
user_query, last_perception_infos, perception_infos, width,
height, last_keyboard, keyboard, summary, action, add_info)
system_prompt_reflect = get_system_prompt('reflect')
messages = [{'role': 'system', 'content': system_prompt_reflect}]

messages.append({'role': 'user', 'content': prompt_reflect})

self._parse_image_url([last_screenshot_file, screenshot_file],
messages)

logger.info(f'Call reflect agent: Step {step}')
output_reflect = self.llm_reflect.chat(messages=messages)
this_results['reflect'] = output_reflect
reflect = output_reflect.split('### Answer ###')[-1].replace(
'\n', ' ').strip()
messages.append({'role': 'assistant', 'content': output_reflect})
print_status(messages)

            # 'A': the reflect agent judged the action successful; record it
            # and update the completed-task summary via the planner
            if 'A' in reflect:
thought_history.append(thought)
summary_history.append(summary)
action_history.append(action)

prompt_memory = get_process_prompt(user_query, thought_history,
summary_history,
action_history,
completed_requirements,
add_info)
                system_prompt_plan = get_system_prompt('plan')
                messages = [{'role': 'system', 'content': system_prompt_plan}]
messages.append({'role': 'user', 'content': prompt_memory})

logger.info(f'Call planner agent: Step {step}')
output_memory = self.llm_planner.chat(messages=messages)

messages.append({
'role': 'assistant',
'content': output_memory
})
print_status(messages)

                completed_requirements = output_memory.split(
                    '### Completed contents ###')[-1].replace('\n', ' ').strip()
this_results['process'] = output_memory

error_flag = False

            # 'B': the action led to a wrong page; mark the error and go back
            elif 'B' in reflect:
                error_flag = True
                self.env.act('Back')

            # 'C': the action did not produce the expected change; mark the
            # error and retry
            elif 'C' in reflect:
                error_flag = True


def print_status(chat_history):
print('*' * 100)
for chat in chat_history:
print('role:', chat['role'])
content = chat['content']
if isinstance(content, str):
print(content)
else:
            # multimodal content: the first item is text, the rest are images
            print(content[0]['text'] + '<image>' * (len(content) - 1) + '\n')
print('*' * 100)