diff --git a/apps/mobile_agent/README.md b/apps/mobile_agent/README.md new file mode 100644 index 00000000..4cef3f2a --- /dev/null +++ b/apps/mobile_agent/README.md @@ -0,0 +1,26 @@ +# Mobile-Agent-v2: Mobile Device Operation Assistant with Effective Navigation via Multi-Agent Collaboration
+
+## 🔧Getting Started
+
+### Installation
+```
+pip install -r requirements.txt
+```
+
+### Preparation for Connecting Mobile Device
+1. Download the [Android Debug Bridge](https://developer.android.com/tools/releases/platform-tools?hl=en).
+2. Turn on the ADB debugging switch on your Android phone. The switch is located in the developer options, which need to be enabled first.
+3. Connect your phone to the computer with a data cable and select "Transfer files".
+4. Test your ADB environment as follows: ```/path/to/adb devices```. If the connected devices are displayed, the preparation is complete.
+5. If you are using a Mac or Linux system, make sure the adb binary is executable: ```sudo chmod +x /path/to/adb```
+6. If you are using a Windows system, the path will look like ```xx/xx/adb.exe```
+
+
+
+### Run
+
+The arguments for running the demo are:
+* `--adb_path`: The path to your adb executable.
+* `--openai_api_key`: The OpenAI API key used to call the LLM.
+* `--dashscope_api_key`: The DashScope API key used to call qwen-vl.
+* `--instruction`: The instruction you want the agent to execute.
diff --git a/apps/mobile_agent/requirements.txt b/apps/mobile_agent/requirements.txt new file mode 100644 index 00000000..1fdd8d84 --- /dev/null +++ b/apps/mobile_agent/requirements.txt @@ -0,0 +1,17 @@ +git+https://github.com/openai/CLIP.git
+keras==2.9.0
+matplotlib
+modelscope
+opencv-python
+pyclipper
+pycocotools
+SentencePiece
+shapely
+supervision
+TensorFlow==2.9.1
+tf_keras
+tf_slim
+timm
+torch
+torchvision
+transformers
diff --git a/apps/mobile_agent/run.py b/apps/mobile_agent/run.py new file mode 100644 index 00000000..ad300d81 --- /dev/null +++ b/apps/mobile_agent/run.py @@ -0,0 +1,39 @@ +import argparse
+import os
+
+from modelscope_agent.agents.mobile_agent_v2 import MobileAgentV2
+from modelscope_agent.environment import ADBEnvironment
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--adb_path', type=str, default='./adb/adb')
+parser.add_argument(
+    '--openai_api_key', type=str, default=os.getenv('OPENAI_API_KEY'))
+parser.add_argument(
+    '--dashscope_api_key', type=str, default=os.getenv('DASHSCOPE_API_KEY'))
+parser.add_argument(
+    '--instruction', type=str, default="Tell me today's weather")
+
+args = parser.parse_args()
+
+adb_path = args.adb_path
+
+os.environ['OPENAI_API_KEY'] = args.openai_api_key
+# used to call qwen-vl to describe icons during perception
+os.environ['DASHSCOPE_API_KEY'] = args.dashscope_api_key
+
+instruction = args.instruction
+
+llm_config = {
+    'model': 'gpt-4o',
+    'model_server': 'openai',
+}
+
+env = ADBEnvironment(adb_path)
+
+agent = MobileAgentV2(
+    env=env,
+    llm_decision=llm_config,
+    llm_planner=llm_config,
+    llm_reflect=llm_config)
+
+agent.run(instruction)
diff --git a/modelscope_agent/agent.py b/modelscope_agent/agent.py index 292893d0..08d134b6 100644 --- a/modelscope_agent/agent.py +++ b/modelscope_agent/agent.py @@ -5,7 +5,6 @@ from modelscope_agent.llm.base import BaseChatModel from modelscope_agent.tools.base import (TOOL_REGISTRY, BaseTool, ToolServiceProxy) -from modelscope_agent.utils.logger import agent_logger as logger from modelscope_agent.utils.utils import has_chinese_chars @@ -193,11 +192,6 @@ def _parse_image_url(self, image_url: List[Union[str, Dict]], messages: List[Dict]) -> List[Dict]:
assert len(messages) > 0 - if self.llm.model not in ['gpt-4o', 'gpt-4-turbo']: - logger.warning( - f'currently only gp4-4o and gpt-4-turbo support image_url, but the model is {self.llm.model}' - ) - return messages if isinstance(image_url[0], str): image_url = [{'url': url} for url in image_url] diff --git a/modelscope_agent/agents/__init__.py b/modelscope_agent/agents/__init__.py index 9dc09d55..e4027a9a 100644 --- a/modelscope_agent/agents/__init__.py +++ b/modelscope_agent/agents/__init__.py @@ -1,4 +1,5 @@ from .agent_builder import AgentBuilder from .gen_keyword import GenKeyword +from .mobile_agent_v2 import MobileAgentV2 from .multi_role_play import MultiRolePlay from .role_play import RolePlay diff --git a/modelscope_agent/agents/mobile_agent_v2/__init__.py b/modelscope_agent/agents/mobile_agent_v2/__init__.py new file mode 100644 index 00000000..924f67fe --- /dev/null +++ b/modelscope_agent/agents/mobile_agent_v2/__init__.py @@ -0,0 +1 @@ +from .mobile_agent_v2 import MobileAgentV2 diff --git a/modelscope_agent/agents/mobile_agent_v2/mobile_agent_v2.py b/modelscope_agent/agents/mobile_agent_v2/mobile_agent_v2.py new file mode 100644 index 00000000..5551be43 --- /dev/null +++ b/modelscope_agent/agents/mobile_agent_v2/mobile_agent_v2.py @@ -0,0 +1,204 @@ +import copy +import os +from typing import Dict, List, Optional, Tuple, Union + +import json +from modelscope_agent import Agent +from modelscope_agent.environment import ADBEnvironment +from modelscope_agent.llm import get_chat_model +from modelscope_agent.llm.base import BaseChatModel +from modelscope_agent.utils.logger import agent_logger as logger + +from .prompt import (get_action_prompt, get_memory_prompt, get_process_prompt, + get_reflect_prompt, get_system_prompt) + + +class MobileAgentV2(Agent): + + def __init__(self, + env: ADBEnvironment, + function_list: Optional[List[Union[str, Dict]]] = None, + llm_planner: Optional[Union[Dict, BaseChatModel]] = None, + llm_decision: Optional[Union[Dict, BaseChatModel]] = None, + llm_reflect: Optional[Union[Dict, BaseChatModel]] = None, + storage_path: Optional[str] = None, + **kwargs): + + self.env = env + + if isinstance(llm_planner, Dict): + self.llm_config_planner = llm_planner + self.llm_planner = get_chat_model(**self.llm_config_planner) + else: + self.llm_planner = llm_planner + + if isinstance(llm_decision, Dict): + self.llm_config_decision = llm_decision + self.llm_decision = get_chat_model(**self.llm_config_decision) + else: + self.llm_decision = llm_decision + + if isinstance(llm_reflect, Dict): + self.llm_config_reflect = llm_reflect + self.llm_reflect = get_chat_model(**self.llm_config_reflect) + else: + self.llm_reflect = llm_reflect + self.stream = True + + self.function_list = [] + self.function_map = {} + if function_list: + for function in function_list: + self._register_tool(function) + + self.storage_path = storage_path + self.mem = None + + def _run(self, user_query, **kwargs): + step = 0 + + thought_history = [] + summary_history = [] + action_history = [] + add_info = "If you want to tap an icon of an app, use the action \"Open app\"" + summary = '' + action = '' + completed_requirements = '' + memory = '' + + error_flag = False + + logger.info('Start running mobile agent') + while True: + step += 1 + this_results = {} + logger.info(f'Oberserve the environment: Step {step}') + perception_infos, width, height, keyboard, screenshot_file = self.env.observe( + ) + + # decision + prompt_action = get_action_prompt(user_query, perception_infos, + width, height, 
keyboard, + summary_history, action_history, + summary, action, add_info, + error_flag, + completed_requirements, memory) + system_prompt_decision = get_system_prompt('decision') + messages = [{'role': 'system', 'content': system_prompt_decision}] + messages.append({'role': 'user', 'content': prompt_action}) + + self._parse_image_url([screenshot_file], messages) + + logger.info(f'Call decision agent: Step {step}') + output_decision = self.llm_decision.chat(messages=messages) + # this_results['decision'] = output_action + + thought = output_decision.split( + '### Thought ###')[-1].split('### Action ###')[0].replace( + '\n', ' ').replace(':', '').replace(' ', ' ').strip() + summary = output_decision.split('### Operation ###')[-1].replace( + '\n', ' ').replace(' ', ' ').strip() + action = output_decision.split('### Action ###')[-1].split( + '### Operation ###')[0].replace('\n', + ' ').replace(' ', + ' ').strip() + + messages.append({'role': 'assistant', 'content': output_decision}) + + prompt_memory = get_memory_prompt() + + messages.append({'role': 'user', 'content': prompt_memory}) + + logger.info(f'Call decision agent with action: Step {step}') + output_memory = self.llm_decision.chat(messages=messages) + + messages.append({'role': 'assistant', 'content': output_memory}) + + output_memory = output_memory.split('### Important content ###')[ + -1].split('\n\n')[0].strip() + '\n' + if 'None' not in output_memory and output_memory not in memory: + memory += output_memory + this_results['memory'] = output_memory + + print_status(messages) + + if self.env.act(action): + break + + last_perception_infos = copy.deepcopy(perception_infos) + last_keyboard = keyboard + last_screenshot_file = screenshot_file + + logger.info(f'Observe the environment before reflect: Step {step}') + perception_infos, width, height, keyboard, screenshot_file = self.env.observe( + ) + + # reflect + prompt_reflect = get_reflect_prompt( + user_query, last_perception_infos, perception_infos, width, + height, last_keyboard, keyboard, summary, action, add_info) + system_prompt_reflect = get_system_prompt('reflect') + messages = [{'role': 'system', 'content': system_prompt_reflect}] + + messages.append({'role': 'user', 'content': prompt_reflect}) + + self._parse_image_url([last_screenshot_file, screenshot_file], + messages) + + logger.info(f'Call reflect agent: Step {step}') + output_reflect = self.llm_reflect.chat(messages=messages) + this_results['reflect'] = output_reflect + reflect = output_reflect.split('### Answer ###')[-1].replace( + '\n', ' ').strip() + messages.append({'role': 'assistant', 'content': output_reflect}) + print_status(messages) + + if 'A' in reflect: + thought_history.append(thought) + summary_history.append(summary) + action_history.append(action) + + prompt_memory = get_process_prompt(user_query, thought_history, + summary_history, + action_history, + completed_requirements, + add_info) + system_prompy_plan = get_system_prompt('plan') + messages = [{'role': 'system', 'content': system_prompy_plan}] + messages.append({'role': 'user', 'content': prompt_memory}) + + logger.info(f'Call planner agent: Step {step}') + output_memory = self.llm_planner.chat(messages=messages) + + messages.append({ + 'role': 'assistant', + 'content': output_memory + }) + print_status(messages) + + completed_requirements = output_memory.split( + '### Completed contents ###')[-1].replace('\n', + ' ').strip() + this_results['process'] = output_memory + + error_flag = False + + elif 'B' in reflect: + error_flag = True + 
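+                # reflection answered 'B': the last action landed on a wrong page, so record the error and navigate back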
self.env.act('Back') + + elif 'C' in reflect: + error_flag = True + + +def print_status(chat_history): + print('*' * 100) + for chat in chat_history: + print('role:', chat['role']) + content = chat['content'] + if isinstance(content, str): + print(content) + else: + print(content[0]['text'] + '' * (len(content[1]) - 1) + + '\n') + print('*' * 100) diff --git a/modelscope_agent/agents/mobile_agent_v2/prompt.py b/modelscope_agent/agents/mobile_agent_v2/prompt.py new file mode 100644 index 00000000..333eba89 --- /dev/null +++ b/modelscope_agent/agents/mobile_agent_v2/prompt.py @@ -0,0 +1,228 @@ +# flake8: noqa +import base64 + + +def encode_image(image_path): + with open(image_path, 'rb') as image_file: + return base64.b64encode(image_file.read()).decode('utf-8') + + +def get_action_prompt(instruction, clickable_infos, width, height, keyboard, + summary_history, action_history, last_summary, + last_action, add_info, error_flag, completed_content, + memory): + prompt = '### Background ###\n' + prompt += f"This image is a phone screenshot. Its width is {width} pixels and its height is {height} pixels. The user\'s instruction is: {instruction}.\n\n" + + prompt += '### Screenshot information ###\n' + prompt += 'In order to help you better perceive the content in this screenshot, we extract some information on the current screenshot through system files. ' + prompt += 'This information consists of two parts: coordinates; content. ' + prompt += 'The format of the coordinates is [x, y], x is the pixel from left to right and y is the pixel from top to bottom; the content is a text or an icon description respectively. ' + prompt += 'The information is as follow:\n' + + for clickable_info in clickable_infos: + if clickable_info['text'] != '' and clickable_info[ + 'text'] != 'icon: None' and clickable_info['coordinates'] != ( + 0, 0): + prompt += f"{clickable_info['coordinates']}; {clickable_info['text']}\n" + + prompt += 'Please note that this information is not necessarily accurate. You need to combine the screenshot to understand.' + prompt += '\n\n' + + prompt += '### Keyboard status ###\n' + prompt += 'We extract the keyboard status of the current screenshot and it is whether the keyboard of the current screenshot is activated.\n' + prompt += 'The keyboard status is as follow:\n' + if keyboard: + prompt += 'The keyboard has been activated and you can type.' + else: + prompt += "The keyboard has not been activated and you can\'t type." + prompt += '\n\n' + + if add_info != '': + prompt += '### Hint ###\n' + prompt += "There are hints to help you complete the user\'s instructions. The hints are as follow:\n" + prompt += add_info + prompt += '\n\n' + + if len(action_history) > 0: + prompt += '### History operations ###\n' + prompt += 'Before reaching this page, some operations have been completed. You need to refer to the completed operations to decide the next operation. 
These operations are as follow:\n' + for i in range(len(action_history)): + prompt += f'Step-{i+1}: [Operation: ' + summary_history[i].split( + ' to ')[0].strip() + '; Action: ' + action_history[i] + ']\n' + prompt += '\n' + + if completed_content != '': + prompt += '### Progress ###\n' + prompt += "After completing the history operations, you have the following thoughts about the progress of user\'s instruction completion:\n" + prompt += 'Completed contents:\n' + completed_content + '\n\n' + + if memory != '': + prompt += '### Memory ###\n' + prompt += 'During the operations, you record the following contents on the screenshot for use in subsequent operations:\n' + prompt += 'Memory:\n' + memory + '\n' + + if error_flag: + prompt += '### Last operation ###\n' + prompt += f"You previously wanted to perform the operation \"{last_summary}\" on this page and executed the Action \"{last_action}\". But you find that this operation does not meet your expectation. You need to reflect and revise your operation this time." + prompt += '\n\n' + + prompt += '### Response requirements ###\n' + prompt += 'Now you need to combine all of the above to perform just one action on the current page. You must choose one of the six actions below:\n' + prompt += "Open app (app name): If the current page is desktop, you can use this action to open the app named \"app name\" on the desktop.\n" + prompt += 'Tap (x, y): Tap the position (x, y) in current page.\n' + prompt += 'Swipe (x1, y1), (x2, y2): Swipe from position (x1, y1) to position (x2, y2).\n' + if keyboard: + prompt += "Type (text): Type the \"text\" in the input box.\n" + else: + prompt += "Unable to Type. You cannot use the action \"Type\" because the keyboard has not been activated. If you want to type, please first activate the keyboard by tapping on the input box on the screen.\n" + prompt += 'Home: Return to home page.\n' + prompt += "Stop: If you think all the requirements of user\'s instruction have been completed and no further operation is required, you can choose this action to terminate the operation process." + prompt += '\n\n' + + prompt += '### Output format ###\n' + prompt += 'Your output consists of the following three parts:\n' + prompt += '### Thought ###\nThink about the requirements that have been completed in previous operations and the requirements that need to be completed in the next one operation.\n' + prompt += "### Action ###\nYou can only choose one from the six actions above. Make sure that the coordinates or text in the \"()\".\n" + prompt += '### Operation ###\nPlease generate a brief natural language description for the operation in Action based on your Thought.' + + return prompt + + +def get_reflect_prompt(instruction, clickable_infos1, clickable_infos2, width, + height, keyboard1, keyboard2, summary, action, + add_info): + prompt = f'These images are two phone screenshots before and after an operation. Their widths are {width} pixels and their heights are {height} pixels.\n\n' + + prompt += 'In order to help you better perceive the content in this screenshot, we extract some information on the current screenshot through system files. ' + prompt += 'The information consists of two parts, consisting of format: coordinates; content. ' + prompt += 'The format of the coordinates is [x, y], x is the pixel from left to right and y is the pixel from top to bottom; the content is a text or an icon description respectively ' + prompt += 'The keyboard status is whether the keyboard of the current page is activated.' 
+ prompt += '\n\n' + + prompt += '### Before the current operation ###\n' + prompt += 'Screenshot information:\n' + for clickable_info in clickable_infos1: + if clickable_info['text'] != '' and clickable_info[ + 'text'] != 'icon: None' and clickable_info['coordinates'] != ( + 0, 0): + prompt += f"{clickable_info['coordinates']}; {clickable_info['text']}\n" + prompt += 'Keyboard status:\n' + if keyboard1: + prompt += f'The keyboard has been activated.' + else: + prompt += 'The keyboard has not been activated.' + prompt += '\n\n' + + prompt += '### After the current operation ###\n' + prompt += 'Screenshot information:\n' + for clickable_info in clickable_infos2: + if clickable_info['text'] != '' and clickable_info[ + 'text'] != 'icon: None' and clickable_info['coordinates'] != ( + 0, 0): + prompt += f"{clickable_info['coordinates']}; {clickable_info['text']}\n" + prompt += 'Keyboard status:\n' + if keyboard2: + prompt += f'The keyboard has been activated.' + else: + prompt += 'The keyboard has not been activated.' + prompt += '\n\n' + + prompt += '### Current operation ###\n' + prompt += f"The user\'s instruction is: {instruction}. You also need to note the following requirements: {add_info}. In the process of completing the requirements of instruction, an operation is performed on the phone. Below are the details of this operation:\n" + prompt += 'Operation thought: ' + summary.split(' to ')[0].strip() + '\n' + prompt += 'Operation action: ' + action + prompt += '\n\n' + + prompt += '### Response requirements ###\n' + prompt += 'Now you need to output the following content based on the screenshots before and after the current operation:\n' + prompt += "Whether the result of the \"Operation action\" meets your expectation of \"Operation thought\"?\n" + prompt += "A: The result of the \"Operation action\" meets my expectation of \"Operation thought\".\n" + prompt += "B: The \"Operation action\" results in a wrong page and I need to return to the previous page.\n" + prompt += "C: The \"Operation action\" produces no changes." + prompt += '\n\n' + + prompt += '### Output format ###\n' + prompt += 'Your output format is:\n' + prompt += '### Thought ###\nYour thought about the question\n' + prompt += '### Answer ###\nA or B or C' + + return prompt + + +def get_memory_prompt(insight=''): + if insight != '': + prompt = '### Important content ###\n' + prompt += insight + prompt += '\n\n' + + prompt += '### Response requirements ###\n' + prompt += "Please think about whether there is any content closely related to ### Important content ### on the current page? If there is, please output the content. If not, please output \"None\".\n\n" + + else: + prompt = '### Response requirements ###\n' + prompt += "Please think about whether there is any content closely related to user\'s instrcution on the current page? If there is, please output the content. If not, please output \"None\".\n\n" + + prompt += '### Output format ###\n' + prompt += 'Your output format is:\n' + prompt += '### Important content ###\nThe content or None. Please do not repeatedly output the information in ### Memory ###.' + + return prompt + + +def get_process_prompt(instruction, thought_history, summary_history, + action_history, completed_content, add_info): + prompt = '### Background ###\n' + prompt += f"There is an user\'s instruction which is: {instruction}. 
You are a mobile phone operating assistant and are operating the user\'s mobile phone.\n\n" + + if add_info != '': + prompt += '### Hint ###\n' + prompt += "There are hints to help you complete the user\'s instructions. The hints are as follow:\n" + prompt += add_info + prompt += '\n\n' + + if len(thought_history) > 1: + prompt += '### History operations ###\n' + prompt += "To complete the requirements of user\'s instruction, you have performed a series of operations. These operations are as follow:\n" + for i in range(len(summary_history)): + operation = summary_history[i].split(' to ')[0].strip() + prompt += f'Step-{i+1}: [Operation thought: ' + operation + '; Operation action: ' + action_history[ + i] + ']\n' + prompt += '\n' + + prompt += '### Progress thinking ###\n' + prompt += "After completing the history operations, you have the following thoughts about the progress of user\'s instruction completion:\n" + prompt += 'Completed contents:\n' + completed_content + '\n\n' + + prompt += '### Response requirements ###\n' + prompt += "Now you need to update the \"Completed contents\". Completed contents is a general summary of the current contents that have been completed based on the ### History operations ###.\n\n" + + prompt += '### Output format ###\n' + prompt += 'Your output format is:\n' + prompt += "### Completed contents ###\nUpdated Completed contents. Don\'t output the purpose of any operation. Just summarize the contents that have been actually completed in the ### History operations ###." + + else: + prompt += '### Current operation ###\n' + prompt += "To complete the requirements of user\'s instruction, you have performed an operation. Your operation thought and action of this operation are as follows:\n" + prompt += f'Operation thought: {thought_history[-1]}\n' + operation = summary_history[-1].split(' to ')[0].strip() + prompt += f'Operation action: {operation}\n\n' + + prompt += '### Response requirements ###\n' + prompt += "Now you need to combine all of the above to generate the \"Completed contents\".\n" + prompt += "Completed contents is a general summary of the current contents that have been completed. You need to first focus on the requirements of user\'s instruction, and then summarize the contents that have been completed.\n\n" + + prompt += '### Output format ###\n' + prompt += 'Your output format is:\n' + prompt += "### Completed contents ###\nGenerated Completed contents. Don\'t output the purpose of any operation. Just summarize the contents that have been actually completed in the ### Current operation ###.\n" + prompt += '(Please use English to output)' + + return prompt + + +def get_system_prompt(mode): + if mode == 'decision': + sysetm_prompt = "You are a helpful AI mobile phone operating assistant. You need to help me operate the phone to complete the user\'s instruction." + else: + sysetm_prompt = 'You are a helpful AI mobile phone operating assistant.' 
+ return sysetm_prompt diff --git a/modelscope_agent/agents_registry.py b/modelscope_agent/agents_registry.py index 695cf924..6123ae35 100644 --- a/modelscope_agent/agents_registry.py +++ b/modelscope_agent/agents_registry.py @@ -1,4 +1,4 @@ -from typing import List, Union +from typing import List from modelscope_agent.agent import Agent from modelscope_agent.environment import Environment diff --git a/modelscope_agent/environment/__init__.py b/modelscope_agent/environment/__init__.py new file mode 100644 index 00000000..8b48e6fd --- /dev/null +++ b/modelscope_agent/environment/__init__.py @@ -0,0 +1,2 @@ +from .android_adb import ADBEnvironment +from .environment import Environment diff --git a/modelscope_agent/environment/android_adb/__init__.py b/modelscope_agent/environment/android_adb/__init__.py new file mode 100644 index 00000000..90585d6c --- /dev/null +++ b/modelscope_agent/environment/android_adb/__init__.py @@ -0,0 +1 @@ +from .android_adb_env import ADBEnvironment diff --git a/modelscope_agent/environment/android_adb/android_adb_env.py b/modelscope_agent/environment/android_adb/android_adb_env.py new file mode 100644 index 00000000..dd08969b --- /dev/null +++ b/modelscope_agent/environment/android_adb/android_adb_env.py @@ -0,0 +1,315 @@ +import asyncio +import os +import shutil +import subprocess +import time +from typing import List + +from modelscope_agent.utils.logger import agent_logger as logger +from PIL import Image + +from modelscope.hub.snapshot_download import snapshot_download +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks +from .utils import (agenerate, crop, det, draw_coordinates_on_image, + encode_image, get_all_files_in_folder, merge_text_blocks, + ocr) + + +class ADBEnvironment: + + def __init__(self, adb_path: str) -> None: + self.adb_path = adb_path + + # ocr pipeline + self.ocr_detection = pipeline( + Tasks.ocr_detection, + model='damo/cv_resnet18_ocr-detection-line-level_damo') + self.ocr_recognition = pipeline( + Tasks.ocr_recognition, + model='damo/cv_convnextTiny_ocr-recognition-document_damo') + + # groundingdino model + model_dir = snapshot_download( + 'AI-ModelScope/GroundingDINO', revision='v1.0.0') + self.groundingdino = pipeline('grounding-dino-task', model=model_dir) + + self.temp_dir = 'temp' + + self.screenshot_dir = 'screenshot' + if not os.path.exists(self.temp_dir): + os.mkdir(self.temp_dir) + if not os.path.exists(self.screenshot_dir): + os.mkdir(self.screenshot_dir) + + self.screenshot_file = os.path.join(self.screenshot_dir, + 'screenshot.jpg') + self.last_screenshot_file = os.path.join(self.screenshot_dir, + 'last_screenshot.jpg') + + def observe(self): + perception_infos, width, height, keyboard = asyncio.run( + self.get_perception_infos(self.screenshot_file)) + screenshot_file = encode_image(self.screenshot_file) + return perception_infos, width, height, keyboard, screenshot_file + + def act(self, action): + if 'Open app' in action: + app_name = action.split('(')[-1].split(')')[0] + text, coordinate = ocr(self.screenshot_file, self.ocr_detection, + self.ocr_recognition) + # tap_coordinate = [0, 0] + for ti in range(len(text)): + if app_name == text[ti]: + name_coordinate = [ + int((coordinate[ti][0] + coordinate[ti][2]) / 2), + int((coordinate[ti][1] + coordinate[ti][3]) / 2) + ] + self.tap(name_coordinate[0], name_coordinate[1] + - int(coordinate[ti][3] - coordinate[ti][1])) # + + elif 'Tap' in action: + coordinate = action.split('(')[-1].split(')')[0].split(', ') + x, y = int(coordinate[0]), 
int(coordinate[1]) + self.tap(x, y) + + elif 'Swipe' in action: + coordinate1 = action.split('Swipe (')[-1].split('), (')[0].split( + ', ') + coordinate2 = action.split('), (')[-1].split(')')[0].split(', ') + x1, y1 = int(coordinate1[0]), int(coordinate1[1]) + x2, y2 = int(coordinate2[0]), int(coordinate2[1]) + self.slide(x1, y1, x2, y2) + + elif 'Type' in action: + if '(text)' not in action: + text = action.split('(')[-1].split(')')[0] + else: + text = action.split(" \"")[-1].split("\"")[0] + self.type(text) + + elif 'Back' in action: + self.back() + + elif 'Home' in action: + self.home() + + elif 'Stop' in action: + return True + time.sleep(5) + if os.path.exists(self.last_screenshot_file): + os.remove(self.last_screenshot_file) + os.rename(self.screenshot_file, self.last_screenshot_file) + return False + + async def get_perception_infos(self, screenshot_file): + + logger.info('Start getting perception infos') + self.get_screenshot() + + width, height = Image.open(screenshot_file).size + logger.info('Start use OCR get text and coordinates') + text, coordinates = ocr(screenshot_file, self.ocr_detection, + self.ocr_recognition) + text, coordinates = merge_text_blocks(text, coordinates) + logger.info('End use OCR get text and coordinates') + + center_list = [[(coordinate[0] + coordinate[2]) / 2, + (coordinate[1] + coordinate[3]) / 2] + for coordinate in coordinates] + draw_coordinates_on_image(screenshot_file, center_list) + + perception_infos = [] + for i in range(len(coordinates)): + perception_info = { + 'text': 'text: ' + text[i], + 'coordinates': coordinates[i] + } + perception_infos.append(perception_info) + + logger.info('Start use groundino to detect icons') + coordinates = det(screenshot_file, 'icon', self.groundingdino) + logger.info('End use groundino to detect icons') + + for i in range(len(coordinates)): + perception_info = {'text': 'icon', 'coordinates': coordinates[i]} + perception_infos.append(perception_info) + + image_box = [] + image_id = [] + for i in range(len(perception_infos)): + if perception_infos[i]['text'] == 'icon': + image_box.append(perception_infos[i]['coordinates']) + image_id.append(i) + + for i in range(len(image_box)): + crop(screenshot_file, image_box[i], image_id[i]) + + images = get_all_files_in_folder(self.temp_dir) + + logger.info('Start use qwen-vl to describe icons') + if len(images) > 0: + images = sorted( + images, key=lambda x: int(x.split('/')[-1].split('.')[0])) + image_id = [ + int(image.split('/')[-1].split('.')[0]) for image in images + ] + icon_map = {} + # Please describe this icon. + tasks = [] + idx_arr = [] + prompt = 'This image is an icon from a phone screen. Please describe the color and shape of this icon.' 
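+            # crops taller than 80% of the screen or covering more than 20% of its area are not described ('None'); the rest are sent to qwen-vl concurrently via asyncio.gather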
+ for i in range(len(images)): + image_path = os.path.join(self.temp_dir, images[i]) + icon_width, icon_height = Image.open(image_path).size + if icon_height > 0.8 * height or icon_width * icon_height > 0.2 * width * height: + des = 'None' + icon_map[i + 1] = des + else: + task = agenerate(image_path, prompt) + idx_arr.append(i) + tasks.append(task) + + descriptions = await asyncio.gather(*tasks) + for i, j in zip(idx_arr, range(len(descriptions))): + icon_map[i + 1] = descriptions[j] + + for i, j in zip(image_id, range(1, len(image_id) + 1)): + if icon_map.get(j): + perception_infos[i]['text'] = 'icon: ' + icon_map[j] + + logger.info('End use qwen-vl to describe icons') + for i in range(len(perception_infos)): + perception_infos[i]['coordinates'] = [ + int((perception_infos[i]['coordinates'][0] + + perception_infos[i]['coordinates'][2]) / 2), + int((perception_infos[i]['coordinates'][1] + + perception_infos[i]['coordinates'][3]) / 2) + ] + + shutil.rmtree(self.temp_dir) + os.mkdir(self.temp_dir) + + keyboard = False + for perception_info in perception_infos: + if perception_info['coordinates'][1] < 0.95 * height: + continue + if 'ADB Keyboard' in perception_info['text']: + keyboard = True + break + logger.info('Finish getting perception infos') + return perception_infos, width, height, keyboard + + # ADB related functions + def get_size(self): + command = self.adb_path + ' shell wm size' + result = subprocess.run( + command, capture_output=True, text=True, shell=True) + resolution_line = result.stdout.strip().split('\n')[-1] + width, height = map(int, resolution_line.split(' ')[-1].split('x')) + return width, height + + def get_xml(self): + adb_path = self.adb_path + process = subprocess.Popen([adb_path, 'shell', 'uiautomator', 'dump'], + stdout=subprocess.PIPE) + process.communicate() + subprocess.run([ + adb_path, 'pull', '/sdcard/window_dump.xml', + './xml/window_dump.xml' + ]) + + def take_screenshots(self, num_screenshots, output_folder, crop_y_start, + crop_y_end, slide_y_start, slide_y_end): + adb_path = self.adb_path + if not os.path.exists(output_folder): + os.makedirs(output_folder) + + for i in range(num_screenshots): + command = adb_path + f' shell rm /sdcard/screenshot{i}.png' + subprocess.run(command, capture_output=True, text=True, shell=True) + command = adb_path + f' shell screencap -p /sdcard/screenshot{i}.png' + subprocess.run(command, capture_output=True, text=True, shell=True) + command = adb_path + f' pull /sdcard/screenshot{i}.png {output_folder}' + subprocess.run(command, capture_output=True, text=True, shell=True) + image = Image.open(f'{output_folder}/screenshot{i}.png') + cropped_image = image.crop( + (0, crop_y_start, image.width, crop_y_end)) + cropped_image.save(f'{output_folder}/screenshot{i}.png') + subprocess.run([ + adb_path, 'shell', 'input', 'swipe', '500', + str(slide_y_start), '500', + str(slide_y_end) + ]) + + def get_screenshot(self): + adb_path = self.adb_path + command = adb_path + ' shell rm /sdcard/screenshot.png' + subprocess.run(command, capture_output=True, text=True, shell=True) + time.sleep(0.5) + command = adb_path + ' shell screencap -p /sdcard/screenshot.png' + subprocess.run(command, capture_output=True, text=True, shell=True) + time.sleep(0.5) + command = adb_path + ' pull /sdcard/screenshot.png ./screenshot' + subprocess.run(command, capture_output=True, text=True, shell=True) + image_path = './screenshot/screenshot.png' + save_path = './screenshot/screenshot.jpg' + image = Image.open(image_path) + image.convert('RGB').save(save_path, 
'JPEG') + os.remove(image_path) + + def get_keyboard(self): + adb_path = self.adb_path + command = adb_path + ' shell dumpsys input_method' + process = subprocess.run( + command, + capture_output=True, + text=True, + shell=True, + encoding='utf-8') + output = process.stdout.strip() + for line in output.split('\n'): + if 'mInputShown' in line: + if 'mInputShown=true' in line: + + for line in output.split('\n'): + if 'hintText' in line: + hintText = line.split('hintText=')[-1].split( + ' label')[0] + break + + return True, hintText + elif 'mInputShown=false' in line: + return False, None + + def tap(self, x, y): + command = self.adb_path + f' shell input tap {x} {y}' + subprocess.run(command, capture_output=True, text=True, shell=True) + + def type(self, text): + adb_path = self.adb_path + text = text.replace('\\n', '_').replace('\n', '_') + for char in text: + if char == ' ': + command = adb_path + ' shell input text %s' + elif char == '_': + command = adb_path + ' shell input keyevent 66' + elif 'a' <= char <= 'z' or 'A' <= char <= 'Z' or char.isdigit(): + command = adb_path + f' shell input text {char}' + elif char in '-.,!?@\'°/:;()': + command = adb_path + f" shell input text \"{char}\"" + else: + command = adb_path + f" shell am broadcast -a ADB_INPUT_TEXT --es msg \"{char}\"" + subprocess.run(command, capture_output=True, text=True, shell=True) + + def slide(self, x1, y1, x2, y2): + command = self.adb_path + f' shell input swipe {x1} {y1} {x2} {y2} 500' + subprocess.run(command, capture_output=True, text=True, shell=True) + + def back(self): + command = self.adb_path + ' shell input keyevent 4' + subprocess.run(command, capture_output=True, text=True, shell=True) + + def home(self): + command = self.adb_path + ' shell am start -a android.intent.action.MAIN -c android.intent.category.HOME' + subprocess.run(command, capture_output=True, text=True, shell=True) diff --git a/modelscope_agent/environment/android_adb/utils.py b/modelscope_agent/environment/android_adb/utils.py new file mode 100644 index 00000000..d08f012c --- /dev/null +++ b/modelscope_agent/environment/android_adb/utils.py @@ -0,0 +1,419 @@ +# flake8: noqa + +import base64 +import math +import os + +import cv2 +import numpy as np +import torch +from modelscope_agent.utils.retry import retry +from PIL import Image, ImageDraw + + +def crop_image(img, position): + + def distance(x1, y1, x2, y2): + return math.sqrt(pow(x1 - x2, 2) + pow(y1 - y2, 2)) + + position = position.tolist() + for i in range(4): + for j in range(i + 1, 4): + if (position[i][0] > position[j][0]): + tmp = position[j] + position[j] = position[i] + position[i] = tmp + if position[0][1] > position[1][1]: + tmp = position[0] + position[0] = position[1] + position[1] = tmp + + if position[2][1] > position[3][1]: + tmp = position[2] + position[2] = position[3] + position[3] = tmp + + x1, y1 = position[0][0], position[0][1] + x2, y2 = position[2][0], position[2][1] + x3, y3 = position[3][0], position[3][1] + x4, y4 = position[1][0], position[1][1] + + corners = np.zeros((4, 2), np.float32) + corners[0] = [x1, y1] + corners[1] = [x2, y2] + corners[2] = [x4, y4] + corners[3] = [x3, y3] + + img_width = distance((x1 + x4) / 2, (y1 + y4) / 2, (x2 + x3) / 2, + (y2 + y3) / 2) + img_height = distance((x1 + x2) / 2, (y1 + y2) / 2, (x4 + x3) / 2, + (y4 + y3) / 2) + + corners_trans = np.zeros((4, 2), np.float32) + corners_trans[0] = [0, 0] + corners_trans[1] = [img_width - 1, 0] + corners_trans[2] = [0, img_height - 1] + corners_trans[3] = [img_width - 1, img_height - 1] + 
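+    # perspective-warp the detected quadrilateral into an axis-aligned rectangle so the OCR recognizer receives an upright text crop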
+ transform = cv2.getPerspectiveTransform(corners, corners_trans) + dst = cv2.warpPerspective(img, transform, + (int(img_width), int(img_height))) + return dst + + +def calculate_size(box): + return (box[2] - box[0]) * (box[3] - box[1]) + + +def calculate_iou(box1, box2): + xA = max(box1[0], box2[0]) + yA = max(box1[1], box2[1]) + xB = min(box1[2], box2[2]) + yB = min(box1[3], box2[3]) + + interArea = max(0, xB - xA) * max(0, yB - yA) + box1Area = (box1[2] - box1[0]) * (box1[3] - box1[1]) + box2Area = (box2[2] - box2[0]) * (box2[3] - box2[1]) + unionArea = box1Area + box2Area - interArea + iou = interArea / unionArea + + return iou + + +def in_box(box, target): + if (box[0] > target[0]) and (box[1] > target[1]) and ( + box[2] < target[2]) and (box[3] < target[3]): + return True + else: + return False + + +def crop_for_clip(image, box, i, position): + image = Image.open(image) + w, h = image.size + if position == 'left': + bound = [0, 0, w / 2, h] + elif position == 'right': + bound = [w / 2, 0, w, h] + elif position == 'top': + bound = [0, 0, w, h / 2] + elif position == 'bottom': + bound = [0, h / 2, w, h] + elif position == 'top left': + bound = [0, 0, w / 2, h / 2] + elif position == 'top right': + bound = [w / 2, 0, w, h / 2] + elif position == 'bottom left': + bound = [0, h / 2, w / 2, h] + elif position == 'bottom right': + bound = [w / 2, h / 2, w, h] + else: + bound = [0, 0, w, h] + + if in_box(box, bound): + cropped_image = image.crop(box) + cropped_image.save(f'./temp/{i}.jpg') + return True + else: + return False + + +def clip_for_icon(clip_model, clip_preprocess, images, prompt): + image_features = [] + for image_file in images: + image = clip_preprocess(Image.open(image_file)).unsqueeze(0).to( + next(clip_model.parameters()).device) + image_feature = clip_model.encode_image(image) + image_features.append(image_feature) + image_features = torch.cat(image_features) + + import clip + text = clip.tokenize([prompt]).to(next(clip_model.parameters()).device) + text_features = clip_model.encode_text(text) + + image_features /= image_features.norm(dim=-1, keepdim=True) + text_features /= text_features.norm(dim=-1, keepdim=True) + similarity = (100.0 * image_features + @ text_features.T).softmax(dim=0).squeeze(0) + _, max_pos = torch.max(similarity, dim=0) + pos = max_pos.item() + + return pos + + +def order_point(coor): + arr = np.array(coor).reshape([4, 2]) + sum_ = np.sum(arr, 0) + centroid = sum_ / arr.shape[0] + theta = np.arctan2(arr[:, 1] - centroid[1], arr[:, 0] - centroid[0]) + sort_points = arr[np.argsort(theta)] + sort_points = sort_points.reshape([4, -1]) + if sort_points[0][0] > centroid[0]: + sort_points = np.concatenate([sort_points[3:], sort_points[:3]]) + sort_points = sort_points.reshape([4, 2]).astype('float32') + return sort_points + + +def longest_common_substring_length(str1, str2): + m = len(str1) + n = len(str2) + dp = [[0] * (n + 1) for _ in range(m + 1)] + + for i in range(1, m + 1): + for j in range(1, n + 1): + if str1[i - 1] == str2[j - 1]: + dp[i][j] = dp[i - 1][j - 1] + 1 + else: + dp[i][j] = max(dp[i - 1][j], dp[i][j - 1]) + + return dp[m][n] + + +def ocr(image_path, ocr_detection, ocr_recognition): + text_data = [] + coordinate = [] + + image_full = cv2.imread(image_path) + det_result = ocr_detection(image_full) + det_result = det_result['polygons'] + for i in range(det_result.shape[0]): + pts = order_point(det_result[i]) + image_crop = crop_image(image_full, pts) + + try: + result = ocr_recognition(image_crop)['text'][0] + except Exception as e: + 
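+            # recognition may fail on some crops (e.g. very small or distorted regions); log the error and skip this box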
print(e) + continue + + box = [int(e) for e in list(pts.reshape(-1))] + box = [box[0], box[1], box[4], box[5]] + + text_data.append(result) + coordinate.append(box) + + else: + return text_data, coordinate + + +def remove_boxes(boxes_filt, size, iou_threshold=0.5): + boxes_to_remove = set() + + for i in range(len(boxes_filt)): + if calculate_size(boxes_filt[i]) > 0.05 * size[0] * size[1]: + boxes_to_remove.add(i) + for j in range(len(boxes_filt)): + if calculate_size(boxes_filt[j]) > 0.05 * size[0] * size[1]: + boxes_to_remove.add(j) + if i == j: + continue + if i in boxes_to_remove or j in boxes_to_remove: + continue + iou = calculate_iou(boxes_filt[i], boxes_filt[j]) + if iou >= iou_threshold: + boxes_to_remove.add(j) + + boxes_filt = [ + box for idx, box in enumerate(boxes_filt) if idx not in boxes_to_remove + ] + + return boxes_filt + + +def det(input_image_path, + caption, + groundingdino_model, + box_threshold=0.07, + text_threshold=0.5): + image = Image.open(input_image_path) + size = image.size + + caption = caption.lower() + caption = caption.strip() + if not caption.endswith('.'): + caption = caption + '.' + inputs = { + 'IMAGE_PATH': input_image_path, + 'TEXT_PROMPT': caption, + 'BOX_TRESHOLD': box_threshold, + 'TEXT_TRESHOLD': text_threshold + } + + result = groundingdino_model(inputs) + boxes_filt = result['boxes'] + + H, W = size[1], size[0] + for i in range(boxes_filt.size(0)): + boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H]) + boxes_filt[i][:2] -= boxes_filt[i][2:] / 2 + boxes_filt[i][2:] += boxes_filt[i][:2] + + boxes_filt = boxes_filt.cpu().int().tolist() + filtered_boxes = remove_boxes(boxes_filt, size) # [:9] + coordinates = [] + for box in filtered_boxes: + coordinates.append([box[0], box[1], box[2], box[3]]) + + return coordinates + + +def get_all_files_in_folder(folder_path): + file_list = [] + for file_name in os.listdir(folder_path): + file_list.append(file_name) + return file_list + + +def draw_coordinates_on_image(image_path, coordinates): + image = Image.open(image_path) + draw = ImageDraw.Draw(image) + point_size = 10 + for coord in coordinates: + draw.ellipse((coord[0] - point_size, coord[1] - point_size, + coord[0] + point_size, coord[1] + point_size), + fill='red') + output_image_path = './screenshot/output_image.png' + image.save(output_image_path) + return output_image_path + + +def crop(image, box, i): + image = Image.open(image) + x1, y1, x2, y2 = int(box[0]), int(box[1]), int(box[2]), int(box[3]) + if x1 >= x2 - 10 or y1 >= y2 - 10: + return + cropped_image = image.crop((x1, y1, x2, y2)) + cropped_image.save(f'./temp/{i}.jpg') + + +@retry(max_retries=5, delay_seconds=0.5) +def generate(image_file, query): + + from dashscope import MultiModalConversation + + local_file_path = f'file://{image_file}' + messages = [{ + 'role': 'user', + 'content': [ + { + 'image': local_file_path + }, + { + 'text': query + }, + ] + }] + response = MultiModalConversation.call( + model='qwen-vl-plus', + messages=messages, + api_key=os.getenv('DASHSCOPE_API_KEY')) + + return response.output.choices[0].message.content[0]['text'] + + +async def agenerate(image_path, prompt): + import asyncio + loop = asyncio.get_running_loop() + result = await loop.run_in_executor(None, generate, image_path, prompt) + return result + + +def merge_text_blocks(text_list, coordinates_list): + merged_text_blocks = [] + merged_coordinates = [] + + sorted_indices = sorted( + range(len(coordinates_list)), + key=lambda k: (coordinates_list[k][1], coordinates_list[k][0])) + sorted_text_list = 
[text_list[i] for i in sorted_indices]
+    sorted_coordinates_list = [coordinates_list[i] for i in sorted_indices]
+
+    num_blocks = len(sorted_text_list)
+    merge = [False] * num_blocks
+
+    for i in range(num_blocks):
+        if merge[i]:
+            continue
+
+        anchor = i
+
+        group_text = [sorted_text_list[anchor]]
+        group_coordinates = [sorted_coordinates_list[anchor]]
+
+        for j in range(i + 1, num_blocks):
+            if merge[j]:
+                continue
+
+            if abs(sorted_coordinates_list[anchor][0] - sorted_coordinates_list[j][0]) < 10 and \
+                    sorted_coordinates_list[j][1] - sorted_coordinates_list[anchor][3] >= -10 and sorted_coordinates_list[j][1] - sorted_coordinates_list[anchor][3] < 30 and \
+                    abs(sorted_coordinates_list[anchor][3] - sorted_coordinates_list[anchor][1] - (sorted_coordinates_list[j][3] - sorted_coordinates_list[j][1])) < 10:
+                group_text.append(sorted_text_list[j])
+                group_coordinates.append(sorted_coordinates_list[j])
+                merge[anchor] = True
+                anchor = j
+                merge[anchor] = True
+
+        merged_text = '\n'.join(group_text)
+        min_x1 = min(group_coordinates, key=lambda x: x[0])[0]
+        min_y1 = min(group_coordinates, key=lambda x: x[1])[1]
+        max_x2 = max(group_coordinates, key=lambda x: x[2])[2]
+        max_y2 = max(group_coordinates, key=lambda x: x[3])[3]
+
+        merged_text_blocks.append(merged_text)
+        merged_coordinates.append([min_x1, min_y1, max_x2, max_y2])
+
+    return merged_text_blocks, merged_coordinates
+
+
+def encode_image(image_path):
+    with open(image_path, 'rb') as image_file:
+        base64_image = base64.b64encode(image_file.read()).decode('utf-8')
+        img = f'data:image/jpeg;base64,{base64_image}'
+    return img
diff --git a/modelscope_agent/environment.py b/modelscope_agent/environment/environment.py similarity index 100% rename from modelscope_agent/environment.py rename to modelscope_agent/environment/environment.py diff --git a/modelscope_agent/multi_agents_utils/README.md
b/modelscope_agent/multi_agents_utils/README.md index 08340c5c..4c301bc8 100644 --- a/modelscope_agent/multi_agents_utils/README.md +++ b/modelscope_agent/multi_agents_utils/README.md @@ -48,7 +48,7 @@ Considering the current status of ModelScope-Agent, the following design solutio 1. **Decouple multi-agent interactive logic from single-agent Logic:** - Use **[AgentEnvMixin](../agent_env_util.py)** class to handle all of multi-agent communication logic based on **Ray**, without changing any origin logic in single agent modules. - - Extract environment information in **[Environment](../environment.py)** module, using a publishing/subscribe mechanism to advance interactions without execution-level blocking between agents. + - Extract environment information in **[Environment](../environment/environment.py)** module, using a publishing/subscribe mechanism to advance interactions without execution-level blocking between agents. - Message hub is maintained in **Environment** module, meanwhile each multi-agent actor manage their own history 2. **Introduce an *[Agent Registry Center](../agents_registry.py)* Concept:** diff --git a/modelscope_agent/multi_agents_utils/README_CN.md b/modelscope_agent/multi_agents_utils/README_CN.md index fa4baf82..67423b37 100644 --- a/modelscope_agent/multi_agents_utils/README_CN.md +++ b/modelscope_agent/multi_agents_utils/README_CN.md @@ -49,7 +49,7 @@ 1. **将multi-agent的交互逻辑与single-agent的逻辑解耦:** - 使用**[AgentEnvMixin](../agent_env_util.py)**类基于Ray处理所有multi-agent通信逻辑,无需更改任何现有single-agent模块中的原始逻辑。 - - 在**[Environment](../environment.py)**模块中管理环境信息,使用发布/订阅机制来推动agent之间的互动,而不会在执行层面阻塞agent。 + - 在**[Environment](../environment/environment.py)**模块中管理环境信息,使用发布/订阅机制来推动agent之间的互动,而不会在执行层面阻塞agent。 - 消息中心维护在Environment模块中,同时每个各个agent也单独管理自己的历史记录。 2. 
**引入agent注册中心[Agent Registry Center](../agents_registry.py)概念:** @@ -471,7 +471,7 @@ agent_env_mixin中的主要方法是`step`,它包含以下步骤: * 调用`publish`,将响应消息发布到环境,消息中包含了哪些角色应该接收消息的信息。 -### [environment](../environment.py)详细信息 +### [environment](../environment/environment.py)详细信息 environment用来管理消息中心,它维护了以下信息: *在队列中存储发送给每个agent的消息,并且这些消息会在下一个步骤中从队列中弹出,并被每个agent拉取。 diff --git a/modelscope_agent/multi_agents_utils/executors/local.py b/modelscope_agent/multi_agents_utils/executors/local.py index 11bd7c13..694b7a3c 100644 --- a/modelscope_agent/multi_agents_utils/executors/local.py +++ b/modelscope_agent/multi_agents_utils/executors/local.py @@ -2,7 +2,7 @@ from modelscope_agent.agents_registry import AgentRegistry from modelscope_agent.constants import USER_REQUIREMENT -from modelscope_agent.environment import Environment +from modelscope_agent.environment.environment import Environment from modelscope_agent.schemas import Message diff --git a/modelscope_agent/multi_agents_utils/executors/ray.py b/modelscope_agent/multi_agents_utils/executors/ray.py index 20ebdd48..2034fc5c 100644 --- a/modelscope_agent/multi_agents_utils/executors/ray.py +++ b/modelscope_agent/multi_agents_utils/executors/ray.py @@ -4,7 +4,7 @@ import ray from modelscope_agent.agents_registry import AgentRegistry from modelscope_agent.constants import USER_REQUIREMENT -from modelscope_agent.environment import Environment +from modelscope_agent.environment.environment import Environment from modelscope_agent.schemas import Message from ray._raylet import ObjectRefGenerator diff --git a/tests/test_agent_registry.py b/tests/test_agent_registry.py index 02880f4e..33bd3cf2 100644 --- a/tests/test_agent_registry.py +++ b/tests/test_agent_registry.py @@ -1,4 +1,4 @@ -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock import pytest from modelscope_agent import create_component