From 6c6ecc2560122a8fbac9d6b8689dfa540131b29d Mon Sep 17 00:00:00 2001 From: Soeb Hussain Date: Wed, 27 Nov 2024 23:30:37 +0000 Subject: [PATCH] adding parameters for API in sdk and remote_partitioner --- lib/aryn-sdk/aryn_sdk/partition/partition.py | 9 +++++++++ lib/sycamore/sycamore/transforms/detr_partitioner.py | 5 +++++ 2 files changed, 14 insertions(+) diff --git a/lib/aryn-sdk/aryn_sdk/partition/partition.py b/lib/aryn-sdk/aryn_sdk/partition/partition.py index 39aca96fb..56898456c 100644 --- a/lib/aryn-sdk/aryn_sdk/partition/partition.py +++ b/lib/aryn-sdk/aryn_sdk/partition/partition.py @@ -37,6 +37,7 @@ def partition_file( docparse_url: Optional[str] = None, ssl_verify: bool = True, output_format: Optional[str] = None, + output_label_options: dict[str, Any] = {}, ) -> dict: """ Sends file to Aryn DocParse and returns a dict of its document structure and text @@ -84,6 +85,10 @@ def partition_file( ssl_verify: verify ssl certificates. In databricks, set this to False to fix ssl imcompatibilities. output_format: controls output representation; can be set to "markdown" or "json" default: None (JSON elements) + output_label_options: A dictionary for configuring output label behavior. It supports two options: + promote_title, a boolean that specifies whether to add a title to partitioned elements if one is missing, and + title_candidate_elements, a list of strings representing labels for potential titles. 
+ default: {"promote_title": True, "title_candidate_elements": ["Section-header", "Caption"]} Returns: A dictionary containing "status", "elements", and possibly "error" @@ -138,6 +143,7 @@ def partition_file( selected_pages=selected_pages, output_format=output_format, chunking_options=chunking_options, + output_label_options=output_label_options, ) _logger.debug(f"{options_str}") @@ -212,6 +218,7 @@ def _json_options( selected_pages: Optional[list[Union[list[int], int]]] = None, output_format: Optional[str] = None, chunking_options: Optional[dict[str, Any]] = None, + output_label_options: Optional[dict[str, Any]] = None, ) -> str: # isn't type-checking fun options: dict[str, Union[float, bool, str, list[Union[list[int], int]], dict[str, Any]]] = dict() @@ -233,6 +240,8 @@ def _json_options( options["output_format"] = output_format if chunking_options is not None: options["chunking_options"] = chunking_options + if output_label_options: + options["output_label_options"] = output_label_options options["source"] = "aryn-sdk" diff --git a/lib/sycamore/sycamore/transforms/detr_partitioner.py b/lib/sycamore/sycamore/transforms/detr_partitioner.py index 1e7e65ca4..8b6eccceb 100644 --- a/lib/sycamore/sycamore/transforms/detr_partitioner.py +++ b/lib/sycamore/sycamore/transforms/detr_partitioner.py @@ -176,6 +176,7 @@ def partition_pdf( pages_per_call=pages_per_call, output_format=output_format, source=source, + output_label_options=output_label_options, ) else: if isinstance(threshold, str): @@ -234,6 +235,7 @@ def _call_remote_partitioner( selected_pages: list = [], output_format: Optional[str] = None, source: str = "", + output_label_options: dict[str, Any] = {}, ) -> list[Element]: file.seek(0) options = { @@ -244,6 +246,7 @@ def _call_remote_partitioner( "extract_images": extract_images, "selected_pages": selected_pages, "source": f"sycamore-{source}" if source else "sycamore", + "output_label_options": output_label_options, } if output_format: 
options["output_format"] = output_format @@ -347,6 +350,7 @@ def _partition_remote( pages_per_call: int = -1, output_format: Optional[str] = None, source: str = "", + output_label_options: dict[str, Any] = {}, ) -> list[Element]: page_count = get_page_count(file) @@ -369,6 +373,7 @@ def _partition_remote( selected_pages=[[low, min(high, page_count)]], output_format=output_format, source=source, + output_label_options=output_label_options, ) ) low = high + 1