From 692ffb6d436aca0d98b259a1ee6ab5f4453b8e18 Mon Sep 17 00:00:00 2001 From: LawyZheng Date: Sat, 7 Sep 2024 09:34:33 +0800 Subject: [PATCH] better detect dropdown menu (#778) --- .../forge/prompts/skyvern/custom-select.j2 | 1 + skyvern/webeye/actions/handler.py | 101 +++++++++++++++--- skyvern/webeye/scraper/domUtils.js | 29 ++++- skyvern/webeye/scraper/scraper.py | 15 ++- skyvern/webeye/utils/dom.py | 17 +++ skyvern/webeye/utils/page.py | 4 + streamlit_app/visualizer/api.py | 8 +- 7 files changed, 144 insertions(+), 31 deletions(-) diff --git a/skyvern/forge/prompts/skyvern/custom-select.j2 b/skyvern/forge/prompts/skyvern/custom-select.j2 index 3b1c9a0c7f..e8023ec208 100644 --- a/skyvern/forge/prompts/skyvern/custom-select.j2 +++ b/skyvern/forge/prompts/skyvern/custom-select.j2 @@ -14,6 +14,7 @@ Reply in JSON format with the following keys: "confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence "id": str, // The id of the element to take action on. The id has to be one from the elements list "value": str, // The value to select. + "relevant": bool, // True if the value you select is relevant to the target value, otherwise False. } Context: diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py index a3fdca83cc..18668579ca 100644 --- a/skyvern/webeye/actions/handler.py +++ b/skyvern/webeye/actions/handler.py @@ -388,7 +388,6 @@ async def handle_input_text_action( dom=dom, skyvern_frame=skyvern_frame, incremental_scraped=incremental_scraped, - element_trees=incremental_element, llm_handler=app.SECONDARY_LLM_API_HANDLER, step=step, task=task, @@ -402,10 +401,14 @@ async def handle_input_text_action( element_id=skyvern_element.get_id(), action=action, ) - except Exception as e: + except Exception: await skyvern_element.scroll_into_view() - LOG.exception("Failed to do custom selection transformed from input action") - return [ActionFailure(exception=e)] + LOG.warning( + "Failed to do custom selection transformed from input action, continue to input text", + exc_info=True, + task_id=task.task_id, + step_id=step.step_id, + ) finally: await skyvern_element.press_key("Escape") await skyvern_element.blur() @@ -682,7 +685,6 @@ async def handle_select_option_action( dom=dom, skyvern_frame=skyvern_frame, incremental_scraped=incremental_scraped, - element_trees=incremental_element, llm_handler=app.SECONDARY_LLM_API_HANDLER, step=step, task=task, @@ -1251,21 +1253,23 @@ async def select_from_dropdown( dom: DomUtil, skyvern_frame: SkyvernFrame, incremental_scraped: IncrementalScrapePage, - element_trees: list[dict], llm_handler: LLMAPIHandler, step: Step, task: Task, force_select: bool = False, + should_relevant: bool = True, ) -> tuple[ActionResult | None, str | None]: """ - force_select is used to choose an element to click even there's no dropdown menu - None will be only returned when force_select is false and no dropdown menu popped + force_select: is used to choose an element to click even there's no dropdown menu; + should_relevant: only valid when force_select is "False". When "True", the chosen value must be relevant to the target value; + None will be only returned when: + 1. force_select is false and no dropdown menu popped + 2. force_select is false and match value is not relevant to the target value """ timeout = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS dropdown_menu_element = await locate_dropdown_menu( incremental_scraped=incremental_scraped, - element_trees=element_trees, llm_handler=llm_handler, step=step, task=task, @@ -1297,7 +1301,10 @@ async def select_from_dropdown( raise NoLabelOrValueForCustomSelection(element_id=action.element_id) prompt = prompt_engine.load_prompt( - "custom-select", context_reasoning=action.reasoning, target_value=target_value, elements=html + "custom-select", + context_reasoning=action.reasoning, + target_value=target_value, + elements=html, ) LOG.info( @@ -1320,6 +1327,16 @@ async def select_from_dropdown( if not element_id: raise NoElementMatchedForTargetOption(target=target_value, reason=json_response.get("reasoning")) + if not force_select and should_relevant: + if not json_response.get("relevant", False): + LOG.debug( + "The selected option is not relevant to the target value", + element_id=element_id, + task_id=task.task_id, + step_id=step.step_id, + ) + return None, None + try: selected_element = await SkyvernElement.create_from_incremental(incremental_scraped, element_id) await selected_element.scroll_into_view() @@ -1362,7 +1379,7 @@ async def select_from_dropdown_by_value( step: Step, ) -> ActionResult: timeout = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS - element_trees = await incremental_scraped.get_incremental_element_tree( + await incremental_scraped.get_incremental_element_tree( clean_and_remove_element_tree_factory(task=task, step=step, dom=dom), ) @@ -1373,7 +1390,6 @@ async def select_from_dropdown_by_value( dropdown_menu_element = await locate_dropdown_menu( incremental_scraped=incremental_scraped, - element_trees=element_trees, llm_handler=llm_handler, step=step, task=task, @@ -1419,12 +1435,35 @@ async def continue_callback(incre_scraped: IncrementalScrapePage) -> bool: async def locate_dropdown_menu( incremental_scraped: IncrementalScrapePage, - element_trees: list[dict], llm_handler: LLMAPIHandler, step: Step, task: Task, ) -> SkyvernElement | None: - for idx, element_dict in enumerate(element_trees): + skyvern_frame = incremental_scraped.skyvern_frame + + async def is_ul_or_listbox_element(element_dict: dict) -> bool: + element_id: str = element_dict.get("id", "") + try: + element = await SkyvernElement.create_from_incremental(incremental_scraped, element_id) + except Exception: + LOG.debug( + "Failed to element in the incremental page", + element_id=element_id, + step_id=step.step_id, + task_id=task.task_id, + exc_info=True, + ) + return False + + if element.get_tag_name() == "ul": + return True + + if await element.get_attr("role") == "listbox": + return True + + return False + + for idx, element_dict in enumerate(incremental_scraped.element_tree): # FIXME: confirm max to 10 nodes for now, preventing sendindg too many requests to LLM if idx >= 10: break @@ -1432,7 +1471,7 @@ async def locate_dropdown_menu( element_id = element_dict.get("id") if not element_id: LOG.debug( - "Skip the non-interactable element for the dropdown menu confirm", + "Skip the element without id for the dropdown menu confirm", step_id=step.step_id, task_id=task.task_id, element=element_dict, @@ -1451,6 +1490,38 @@ async def locate_dropdown_menu( ) continue + found_element_id = await head_element.find_children_element_id_by_callback( + cb=is_ul_or_listbox_element, + ) + if found_element_id and found_element_id != element_id: + LOG.debug( + "Found 'ul or listbox' element in children list", + element_id=found_element_id, + step_id=step.step_id, + task_id=task.task_id, + ) + + try: + head_element = await SkyvernElement.create_from_incremental(incremental_scraped, found_element_id) + element_id = found_element_id + except Exception: + LOG.debug( + "Failed to get head element by found element id, use the orignal element id", + element_id=found_element_id, + step_id=step.step_id, + task_id=task.task_id, + exc_info=True, + ) + + if not await skyvern_frame.get_element_visible(await head_element.get_element_handler()): + LOG.debug( + "Skip the element since it's invisible", + step_id=step.step_id, + task_id=task.task_id, + element_id=element_id, + ) + continue + screenshot = await head_element.get_locator().screenshot( timeout=SettingsManager.get_settings().BROWSER_SCREENSHOT_TIMEOUT_MS ) diff --git a/skyvern/webeye/scraper/domUtils.js b/skyvern/webeye/scraper/domUtils.js index d89e2e7876..c0d6dc7e3a 100644 --- a/skyvern/webeye/scraper/domUtils.js +++ b/skyvern/webeye/scraper/domUtils.js @@ -888,7 +888,7 @@ function uniqueId() { return result; } -function buildElementObject(frame, element, interactable) { +function buildElementObject(frame, element, interactable, purgeable = false) { var element_id = element.getAttribute("unique_id") ?? uniqueId(); var elementTagNameLower = element.tagName.toLowerCase(); element.setAttribute("unique_id", element_id); @@ -940,6 +940,8 @@ function buildElementObject(frame, element, interactable) { text: getElementContent(element), children: [], rect: DomUtils.getVisibleClientRect(element, true), + // if purgeable is True, which means this element is only used for building the tree relationship + purgeable: purgeable, // don't trim any attr of this element if keepAllAttr=True keepAllAttr: elementTagNameLower === "svg" || element.closest("svg") !== null, @@ -979,11 +981,11 @@ function buildElementObject(frame, element, interactable) { return elementObj; } -function buildTreeFromBody(frame = "main.frame", open_select = false) { - return buildElementTree(document.body, frame, open_select); +function buildTreeFromBody(frame = "main.frame") { + return buildElementTree(document.body, frame); } -function buildElementTree(starter = document.body, frame = "main.frame") { +function buildElementTree(starter = document.body, frame, full_tree = false) { var elements = []; var resultArray = []; @@ -1078,6 +1080,23 @@ function buildElementTree(starter = document.body, frame = "main.frame") { // build all table related elements into skyvern element // we need these elements to preserve the DOM structure elementObj = buildElementObject(frame, element, false); + } else if (full_tree) { + // when building full tree, we only get text from element itself + // elements without text are purgeable + elementObj = buildElementObject(frame, element, false, true); + let textContent = ""; + if (isElementVisible(element)) { + for (let i = 0; i < element.childNodes.length; i++) { + var node = element.childNodes[i]; + if (node.nodeType === Node.TEXT_NODE) { + textContent += node.data.trim(); + } + } + } + elementObj.text = textContent; + if (textContent.length > 0) { + elementObj.purgeable = false; + } } else { // character length limit for non-interactable elements should be 5000 // we don't use element context in HTML format, @@ -1673,7 +1692,7 @@ function addIncrementalNodeToMap(parentNode, childrenNode) { } for (const child of childrenNode) { - const [_, newNodeTree] = buildElementTree(child, "", false); + const [_, newNodeTree] = buildElementTree(child, "", true); if (newNodeTree.length > 0) { newNodesTreeList.push(...newNodeTree); } diff --git a/skyvern/webeye/scraper/scraper.py b/skyvern/webeye/scraper/scraper.py index 95567e5c5c..abe07baab3 100644 --- a/skyvern/webeye/scraper/scraper.py +++ b/skyvern/webeye/scraper/scraper.py @@ -109,6 +109,9 @@ def json_to_html(element: dict) -> str: for option in element.get("options", []) ) + if element.get("purgeable", False): + return children_html + option_html + # Check if the element is self-closing if tag in ["img", "input", "br", "hr", "meta", "link"] and not option_html and not children_html: return f'<{tag}{attributes_html if not attributes_html else " "+attributes_html}>' @@ -338,7 +341,7 @@ async def get_interactable_element_tree_in_frame( unique_id = await frame_element.get_attribute("unique_id") - frame_js_script = f"() => buildTreeFromBody('{unique_id}', true)" + frame_js_script = f"() => buildTreeFromBody('{unique_id}')" await frame.evaluate(JS_FUNCTION_DEFS) frame_elements, frame_element_tree = await frame.evaluate(frame_js_script) @@ -374,7 +377,7 @@ async def get_interactable_element_tree( :return: Tuple containing the element tree and a map of element IDs to elements. """ await page.evaluate(JS_FUNCTION_DEFS) - main_frame_js_script = "() => buildTreeFromBody('main.frame', true)" + main_frame_js_script = "() => buildTreeFromBody()" elements, element_tree = await page.evaluate(main_frame_js_script) if len(page.main_frame.child_frames) > 0: @@ -504,8 +507,7 @@ def trim_element_tree(elements: list[dict]) -> list[dict]: del queue_ele["attributes"] if "attributes" in queue_ele and not queue_ele.get("keepAllAttr", False): - tag_name = queue_ele["tagName"] if "tagName" in queue_ele else "" - new_attributes = _trimmed_attributes(tag_name, queue_ele["attributes"]) + new_attributes = _trimmed_attributes(queue_ele["attributes"]) if new_attributes: queue_ele["attributes"] = new_attributes else: @@ -536,13 +538,10 @@ def _trimmed_base64_data(attributes: dict) -> dict: return new_attributes -def _trimmed_attributes(tag_name: str, attributes: dict) -> dict: +def _trimmed_attributes(attributes: dict) -> dict: new_attributes: dict = {} for key in attributes: - if key == "id" and tag_name in ["input", "textarea", "select"]: - # We don't want to remove the id attribute any of these elements in case there's a label for it - new_attributes[key] = attributes[key] if key == "role" and attributes[key] in ["listbox", "option"]: new_attributes[key] = attributes[key] if key in RESERVED_ATTRIBUTES and attributes[key]: diff --git a/skyvern/webeye/utils/dom.py b/skyvern/webeye/utils/dom.py index 1dedb25e6f..86afe6a2ad 100644 --- a/skyvern/webeye/utils/dom.py +++ b/skyvern/webeye/utils/dom.py @@ -269,6 +269,23 @@ def find_element_id_in_label_children(self, element_type: InteractiveElement) -> return None + async def find_children_element_id_by_callback( + self, cb: typing.Callable[[dict], typing.Awaitable[bool]] + ) -> str | None: + index = 0 + queue = [self.get_element_dict()] + while index < len(queue): + item = queue[index] + if await cb(item): + return item.get("id", "") + + children: list[dict] = item.get("children", []) + for child in children: + queue.append(child) + + index += 1 + return None + async def find_label_for( self, dom: DomUtil, timeout: float = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS ) -> SkyvernElement | None: diff --git a/skyvern/webeye/utils/page.py b/skyvern/webeye/utils/page.py index 5ed9d92af2..622c07e03f 100644 --- a/skyvern/webeye/utils/page.py +++ b/skyvern/webeye/utils/page.py @@ -188,6 +188,10 @@ async def get_element_scrollable(self, element: ElementHandle) -> bool: js_script = "(element) => isScrollable(element)" return await self.frame.evaluate(js_script, element) + async def get_element_visible(self, element: ElementHandle) -> bool: + js_script = "(element) => isElementVisible(element) && !isHidden(element)" + return await self.frame.evaluate(js_script, element) + async def scroll_to_top(self, draw_boxes: bool) -> float: """ Scroll to the top of the page and take a screenshot. diff --git a/streamlit_app/visualizer/api.py b/streamlit_app/visualizer/api.py index 654fc015b9..0b406cd0ab 100644 --- a/streamlit_app/visualizer/api.py +++ b/streamlit_app/visualizer/api.py @@ -16,18 +16,20 @@ def __init__(self, base_url: str, credentials: str): self.base_url = base_url self.credentials = credentials - def generate_curl_params(self, task_request_body: TaskRequest) -> PreparedRequest: + def generate_curl_params(self, task_request_body: TaskRequest, max_steps: int | None = None) -> PreparedRequest: url = f"{self.base_url}/tasks" payload = task_request_body.model_dump() headers = { "Content-Type": "application/json", "x-api-key": self.credentials, } + if max_steps is not None: + headers["x-max-steps-override"] = str(max_steps) return url, payload, headers - def create_task(self, task_request_body: TaskRequest) -> str | None: - url, payload, headers = self.generate_curl_params(task_request_body) + def create_task(self, task_request_body: TaskRequest, max_steps: int | None = None) -> str | None: + url, payload, headers = self.generate_curl_params(task_request_body, max_steps=max_steps) response = requests.post(url, headers=headers, data=json.dumps(payload)) if "task_id" not in response.json():