Skip to content

Commit

Permalink
better detect dropdown menu (#778)
Browse files Browse the repository at this point in the history
  • Loading branch information
LawyZheng authored Sep 7, 2024
1 parent 95b2e53 commit 692ffb6
Show file tree
Hide file tree
Showing 7 changed files with 144 additions and 31 deletions.
1 change: 1 addition & 0 deletions skyvern/forge/prompts/skyvern/custom-select.j2
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ Reply in JSON format with the following keys:
"confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence
"id": str, // The id of the element to take action on. The id has to be one from the elements list
"value": str, // The value to select.
"relevant": bool, // True if the value you select is relevant to the target value, otherwise False.
}

Context:
Expand Down
101 changes: 86 additions & 15 deletions skyvern/webeye/actions/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,7 +388,6 @@ async def handle_input_text_action(
dom=dom,
skyvern_frame=skyvern_frame,
incremental_scraped=incremental_scraped,
element_trees=incremental_element,
llm_handler=app.SECONDARY_LLM_API_HANDLER,
step=step,
task=task,
Expand All @@ -402,10 +401,14 @@ async def handle_input_text_action(
element_id=skyvern_element.get_id(),
action=action,
)
except Exception as e:
except Exception:
await skyvern_element.scroll_into_view()
LOG.exception("Failed to do custom selection transformed from input action")
return [ActionFailure(exception=e)]
LOG.warning(
"Failed to do custom selection transformed from input action, continue to input text",
exc_info=True,
task_id=task.task_id,
step_id=step.step_id,
)
finally:
await skyvern_element.press_key("Escape")
await skyvern_element.blur()
Expand Down Expand Up @@ -682,7 +685,6 @@ async def handle_select_option_action(
dom=dom,
skyvern_frame=skyvern_frame,
incremental_scraped=incremental_scraped,
element_trees=incremental_element,
llm_handler=app.SECONDARY_LLM_API_HANDLER,
step=step,
task=task,
Expand Down Expand Up @@ -1251,21 +1253,23 @@ async def select_from_dropdown(
dom: DomUtil,
skyvern_frame: SkyvernFrame,
incremental_scraped: IncrementalScrapePage,
element_trees: list[dict],
llm_handler: LLMAPIHandler,
step: Step,
task: Task,
force_select: bool = False,
should_relevant: bool = True,
) -> tuple[ActionResult | None, str | None]:
"""
force_select is used to choose an element to click even there's no dropdown menu
None will be only returned when force_select is false and no dropdown menu popped
force_select: is used to choose an element to click even there's no dropdown menu;
should_relevant: only valid when force_select is "False". When "True", the chosen value must be relevant to the target value;
None will be only returned when:
1. force_select is false and no dropdown menu popped
2. force_select is false and match value is not relevant to the target value
"""
timeout = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS

dropdown_menu_element = await locate_dropdown_menu(
incremental_scraped=incremental_scraped,
element_trees=element_trees,
llm_handler=llm_handler,
step=step,
task=task,
Expand Down Expand Up @@ -1297,7 +1301,10 @@ async def select_from_dropdown(
raise NoLabelOrValueForCustomSelection(element_id=action.element_id)

prompt = prompt_engine.load_prompt(
"custom-select", context_reasoning=action.reasoning, target_value=target_value, elements=html
"custom-select",
context_reasoning=action.reasoning,
target_value=target_value,
elements=html,
)

LOG.info(
Expand All @@ -1320,6 +1327,16 @@ async def select_from_dropdown(
if not element_id:
raise NoElementMatchedForTargetOption(target=target_value, reason=json_response.get("reasoning"))

if not force_select and should_relevant:
if not json_response.get("relevant", False):
LOG.debug(
"The selected option is not relevant to the target value",
element_id=element_id,
task_id=task.task_id,
step_id=step.step_id,
)
return None, None

try:
selected_element = await SkyvernElement.create_from_incremental(incremental_scraped, element_id)
await selected_element.scroll_into_view()
Expand Down Expand Up @@ -1362,7 +1379,7 @@ async def select_from_dropdown_by_value(
step: Step,
) -> ActionResult:
timeout = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS
element_trees = await incremental_scraped.get_incremental_element_tree(
await incremental_scraped.get_incremental_element_tree(
clean_and_remove_element_tree_factory(task=task, step=step, dom=dom),
)

Expand All @@ -1373,7 +1390,6 @@ async def select_from_dropdown_by_value(

dropdown_menu_element = await locate_dropdown_menu(
incremental_scraped=incremental_scraped,
element_trees=element_trees,
llm_handler=llm_handler,
step=step,
task=task,
Expand Down Expand Up @@ -1419,20 +1435,43 @@ async def continue_callback(incre_scraped: IncrementalScrapePage) -> bool:

async def locate_dropdown_menu(
incremental_scraped: IncrementalScrapePage,
element_trees: list[dict],
llm_handler: LLMAPIHandler,
step: Step,
task: Task,
) -> SkyvernElement | None:
for idx, element_dict in enumerate(element_trees):
skyvern_frame = incremental_scraped.skyvern_frame

async def is_ul_or_listbox_element(element_dict: dict) -> bool:
    """Tell whether a scraped node is a ``<ul>`` tag or carries ``role="listbox"``.

    Passed as the callback when searching a candidate element's children
    for the real dropdown container.

    Args:
        element_dict: A node of the incremental element tree; its ``"id"``
            is used to look the element up in the incremental page.

    Returns:
        True for a ``ul`` tag or a ``role="listbox"`` element. False
        otherwise, including when the element cannot be created from the
        incremental page.
    """
    element_id: str = element_dict.get("id", "")
    try:
        element = await SkyvernElement.create_from_incremental(incremental_scraped, element_id)
    except Exception:
        # Best effort: a node that cannot be resolved is simply "not a match",
        # so the search over the remaining nodes keeps going.
        LOG.debug(
            "Failed to create element in the incremental page",
            element_id=element_id,
            step_id=step.step_id,
            task_id=task.task_id,
            exc_info=True,
        )
        return False

    if element.get_tag_name() == "ul":
        return True

    # Only query the attribute when the cheap tag check did not match.
    return await element.get_attr("role") == "listbox"

for idx, element_dict in enumerate(incremental_scraped.element_tree):
# FIXME: cap the confirmation at 10 nodes for now, to avoid sending too many requests to the LLM
if idx >= 10:
break

element_id = element_dict.get("id")
if not element_id:
LOG.debug(
"Skip the non-interactable element for the dropdown menu confirm",
"Skip the element without id for the dropdown menu confirm",
step_id=step.step_id,
task_id=task.task_id,
element=element_dict,
Expand All @@ -1451,6 +1490,38 @@ async def locate_dropdown_menu(
)
continue

found_element_id = await head_element.find_children_element_id_by_callback(
cb=is_ul_or_listbox_element,
)
if found_element_id and found_element_id != element_id:
LOG.debug(
"Found 'ul or listbox' element in children list",
element_id=found_element_id,
step_id=step.step_id,
task_id=task.task_id,
)

try:
head_element = await SkyvernElement.create_from_incremental(incremental_scraped, found_element_id)
element_id = found_element_id
except Exception:
LOG.debug(
"Failed to get head element by found element id, use the orignal element id",
element_id=found_element_id,
step_id=step.step_id,
task_id=task.task_id,
exc_info=True,
)

if not await skyvern_frame.get_element_visible(await head_element.get_element_handler()):
LOG.debug(
"Skip the element since it's invisible",
step_id=step.step_id,
task_id=task.task_id,
element_id=element_id,
)
continue

screenshot = await head_element.get_locator().screenshot(
timeout=SettingsManager.get_settings().BROWSER_SCREENSHOT_TIMEOUT_MS
)
Expand Down
29 changes: 24 additions & 5 deletions skyvern/webeye/scraper/domUtils.js
Original file line number Diff line number Diff line change
Expand Up @@ -888,7 +888,7 @@ function uniqueId() {
return result;
}

function buildElementObject(frame, element, interactable) {
function buildElementObject(frame, element, interactable, purgeable = false) {
var element_id = element.getAttribute("unique_id") ?? uniqueId();
var elementTagNameLower = element.tagName.toLowerCase();
element.setAttribute("unique_id", element_id);
Expand Down Expand Up @@ -940,6 +940,8 @@ function buildElementObject(frame, element, interactable) {
text: getElementContent(element),
children: [],
rect: DomUtils.getVisibleClientRect(element, true),
// when purgeable is true, this element is only kept to preserve the tree relationship
purgeable: purgeable,
// don't trim any attr of this element if keepAllAttr=True
keepAllAttr:
elementTagNameLower === "svg" || element.closest("svg") !== null,
Expand Down Expand Up @@ -979,11 +981,11 @@ function buildElementObject(frame, element, interactable) {
return elementObj;
}

function buildTreeFromBody(frame = "main.frame", open_select = false) {
return buildElementTree(document.body, frame, open_select);
function buildTreeFromBody(frame = "main.frame") {
return buildElementTree(document.body, frame);
}

function buildElementTree(starter = document.body, frame = "main.frame") {
function buildElementTree(starter = document.body, frame, full_tree = false) {
var elements = [];
var resultArray = [];

Expand Down Expand Up @@ -1078,6 +1080,23 @@ function buildElementTree(starter = document.body, frame = "main.frame") {
// build all table related elements into skyvern element
// we need these elements to preserve the DOM structure
elementObj = buildElementObject(frame, element, false);
} else if (full_tree) {
// when building full tree, we only get text from element itself
// elements without text are purgeable
elementObj = buildElementObject(frame, element, false, true);
let textContent = "";
if (isElementVisible(element)) {
for (let i = 0; i < element.childNodes.length; i++) {
var node = element.childNodes[i];
if (node.nodeType === Node.TEXT_NODE) {
textContent += node.data.trim();
}
}
}
elementObj.text = textContent;
if (textContent.length > 0) {
elementObj.purgeable = false;
}
} else {
// character length limit for non-interactable elements should be 5000
// we don't use element context in HTML format,
Expand Down Expand Up @@ -1673,7 +1692,7 @@ function addIncrementalNodeToMap(parentNode, childrenNode) {
}

for (const child of childrenNode) {
const [_, newNodeTree] = buildElementTree(child, "", false);
const [_, newNodeTree] = buildElementTree(child, "", true);
if (newNodeTree.length > 0) {
newNodesTreeList.push(...newNodeTree);
}
Expand Down
15 changes: 7 additions & 8 deletions skyvern/webeye/scraper/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,9 @@ def json_to_html(element: dict) -> str:
for option in element.get("options", [])
)

if element.get("purgeable", False):
return children_html + option_html

# Check if the element is self-closing
if tag in ["img", "input", "br", "hr", "meta", "link"] and not option_html and not children_html:
return f'<{tag}{attributes_html if not attributes_html else " "+attributes_html}>'
Expand Down Expand Up @@ -338,7 +341,7 @@ async def get_interactable_element_tree_in_frame(

unique_id = await frame_element.get_attribute("unique_id")

frame_js_script = f"() => buildTreeFromBody('{unique_id}', true)"
frame_js_script = f"() => buildTreeFromBody('{unique_id}')"

await frame.evaluate(JS_FUNCTION_DEFS)
frame_elements, frame_element_tree = await frame.evaluate(frame_js_script)
Expand Down Expand Up @@ -374,7 +377,7 @@ async def get_interactable_element_tree(
:return: Tuple containing the element tree and a map of element IDs to elements.
"""
await page.evaluate(JS_FUNCTION_DEFS)
main_frame_js_script = "() => buildTreeFromBody('main.frame', true)"
main_frame_js_script = "() => buildTreeFromBody()"
elements, element_tree = await page.evaluate(main_frame_js_script)

if len(page.main_frame.child_frames) > 0:
Expand Down Expand Up @@ -504,8 +507,7 @@ def trim_element_tree(elements: list[dict]) -> list[dict]:
del queue_ele["attributes"]

if "attributes" in queue_ele and not queue_ele.get("keepAllAttr", False):
tag_name = queue_ele["tagName"] if "tagName" in queue_ele else ""
new_attributes = _trimmed_attributes(tag_name, queue_ele["attributes"])
new_attributes = _trimmed_attributes(queue_ele["attributes"])
if new_attributes:
queue_ele["attributes"] = new_attributes
else:
Expand Down Expand Up @@ -536,13 +538,10 @@ def _trimmed_base64_data(attributes: dict) -> dict:
return new_attributes


def _trimmed_attributes(tag_name: str, attributes: dict) -> dict:
def _trimmed_attributes(attributes: dict) -> dict:
new_attributes: dict = {}

for key in attributes:
if key == "id" and tag_name in ["input", "textarea", "select"]:
# We don't want to remove the id attribute any of these elements in case there's a label for it
new_attributes[key] = attributes[key]
if key == "role" and attributes[key] in ["listbox", "option"]:
new_attributes[key] = attributes[key]
if key in RESERVED_ATTRIBUTES and attributes[key]:
Expand Down
17 changes: 17 additions & 0 deletions skyvern/webeye/utils/dom.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,23 @@ def find_element_id_in_label_children(self, element_type: InteractiveElement) ->

return None

async def find_children_element_id_by_callback(
self, cb: typing.Callable[[dict], typing.Awaitable[bool]]
) -> str | None:
index = 0
queue = [self.get_element_dict()]
while index < len(queue):
item = queue[index]
if await cb(item):
return item.get("id", "")

children: list[dict] = item.get("children", [])
for child in children:
queue.append(child)

index += 1
return None

async def find_label_for(
self, dom: DomUtil, timeout: float = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS
) -> SkyvernElement | None:
Expand Down
4 changes: 4 additions & 0 deletions skyvern/webeye/utils/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,10 @@ async def get_element_scrollable(self, element: ElementHandle) -> bool:
js_script = "(element) => isScrollable(element)"
return await self.frame.evaluate(js_script, element)

async def get_element_visible(self, element: ElementHandle) -> bool:
    """Evaluate in this frame whether ``element`` passes ``isElementVisible`` and is not ``isHidden``."""
    return await self.frame.evaluate(
        "(element) => isElementVisible(element) && !isHidden(element)",
        element,
    )

async def scroll_to_top(self, draw_boxes: bool) -> float:
"""
Scroll to the top of the page and take a screenshot.
Expand Down
8 changes: 5 additions & 3 deletions streamlit_app/visualizer/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,20 @@ def __init__(self, base_url: str, credentials: str):
self.base_url = base_url
self.credentials = credentials

def generate_curl_params(self, task_request_body: TaskRequest) -> PreparedRequest:
def generate_curl_params(self, task_request_body: TaskRequest, max_steps: int | None = None) -> PreparedRequest:
url = f"{self.base_url}/tasks"
payload = task_request_body.model_dump()
headers = {
"Content-Type": "application/json",
"x-api-key": self.credentials,
}
if max_steps is not None:
headers["x-max-steps-override"] = str(max_steps)

return url, payload, headers

def create_task(self, task_request_body: TaskRequest) -> str | None:
url, payload, headers = self.generate_curl_params(task_request_body)
def create_task(self, task_request_body: TaskRequest, max_steps: int | None = None) -> str | None:
url, payload, headers = self.generate_curl_params(task_request_body, max_steps=max_steps)

response = requests.post(url, headers=headers, data=json.dumps(payload))
if "task_id" not in response.json():
Expand Down

0 comments on commit 692ffb6

Please sign in to comment.