Skip to content

Commit

Permalink
adding parameters for API in sdk and remote_partitioner
Browse files Browse the repository at this point in the history
  • Loading branch information
Soeb Hussain committed Nov 27, 2024
1 parent b28bd4d commit 6c6ecc2
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 0 deletions.
9 changes: 9 additions & 0 deletions lib/aryn-sdk/aryn_sdk/partition/partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def partition_file(
docparse_url: Optional[str] = None,
ssl_verify: bool = True,
output_format: Optional[str] = None,
output_label_options: dict[str, Any] = {},
) -> dict:
"""
Sends file to Aryn DocParse and returns a dict of its document structure and text
Expand Down Expand Up @@ -84,6 +85,10 @@ def partition_file(
ssl_verify: verify SSL certificates. In Databricks, set this to False to fix SSL incompatibilities.
output_format: controls output representation; can be set to "markdown" or "json"
default: None (JSON elements)
output_label_options: A dictionary for configuring output label behavior. It supports two options:
promote_title, a boolean that specifies whether to add a title to partitioned elements if one is missing, and
title_candidate_elements, a list of strings representing labels for potential titles.
default: {"promote_title": True, "title_candidate_elements": ["Section-header", "Caption"]}
Returns:
A dictionary containing "status", "elements", and possibly "error"
Expand Down Expand Up @@ -138,6 +143,7 @@ def partition_file(
selected_pages=selected_pages,
output_format=output_format,
chunking_options=chunking_options,
output_label_options=output_label_options,
)

_logger.debug(f"{options_str}")
Expand Down Expand Up @@ -212,6 +218,7 @@ def _json_options(
selected_pages: Optional[list[Union[list[int], int]]] = None,
output_format: Optional[str] = None,
chunking_options: Optional[dict[str, Any]] = None,
output_label_options: Optional[dict[str, Any]] = None,
) -> str:
# isn't type-checking fun
options: dict[str, Union[float, bool, str, list[Union[list[int], int]], dict[str, Any]]] = dict()
Expand All @@ -233,6 +240,8 @@ def _json_options(
options["output_format"] = output_format
if chunking_options is not None:
options["chunking_options"] = chunking_options
if output_label_options:
options["output_label_options"] = output_label_options

options["source"] = "aryn-sdk"

Expand Down
5 changes: 5 additions & 0 deletions lib/sycamore/sycamore/transforms/detr_partitioner.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ def partition_pdf(
pages_per_call=pages_per_call,
output_format=output_format,
source=source,
output_label_options=output_label_options,
)
else:
if isinstance(threshold, str):
Expand Down Expand Up @@ -234,6 +235,7 @@ def _call_remote_partitioner(
selected_pages: list = [],
output_format: Optional[str] = None,
source: str = "",
output_label_options: dict[str, Any] = {},
) -> list[Element]:
file.seek(0)
options = {
Expand All @@ -244,6 +246,7 @@ def _call_remote_partitioner(
"extract_images": extract_images,
"selected_pages": selected_pages,
"source": f"sycamore-{source}" if source else "sycamore",
"output_label_options": output_label_options,
}
if output_format:
options["output_format"] = output_format
Expand Down Expand Up @@ -347,6 +350,7 @@ def _partition_remote(
pages_per_call: int = -1,
output_format: Optional[str] = None,
source: str = "",
output_label_options: dict[str, Any] = {},
) -> list[Element]:
page_count = get_page_count(file)

Expand All @@ -369,6 +373,7 @@ def _partition_remote(
selected_pages=[[low, min(high, page_count)]],
output_format=output_format,
source=source,
output_label_options=output_label_options,
)
)
low = high + 1
Expand Down

0 comments on commit 6c6ecc2

Please sign in to comment.