Pierre/new params #468

Closed · wants to merge 4 commits (changes shown from all commits)
258 changes: 94 additions & 164 deletions llama_parse/base.py
@@ -34,157 +34,137 @@
_DEFAULT_SEPARATOR = "\n---\n"


-class LlamaParse(BasePydanticReader):
-    """A smart-parser for files."""
-
-    api_key: str = Field(
-        default="",
-        description="The API key for the LlamaParse API.",
-        validate_default=True,
-    )
-    base_url: str = Field(
-        default=DEFAULT_BASE_URL,
-        description="The base URL of the Llama Parsing API.",
-    )
-    result_type: ResultType = Field(
-        default=ResultType.TXT, description="The result type for the parser."
-    )
-    num_workers: int = Field(
-        default=4,
-        gt=0,
-        lt=10,
-        description="The number of workers to use when sending API requests for parsing.",
-    )
-    check_interval: int = Field(
-        default=1,
-        description="The interval in seconds to check if the parsing is done.",
-    )
-    max_timeout: int = Field(
-        default=2000,
-        description="The maximum timeout in seconds to wait for the parsing to finish.",
-    )
-    verbose: bool = Field(
-        default=True, description="Whether to print the progress of the parsing."
-    )
-    show_progress: bool = Field(
-        default=True, description="Show progress when parsing multiple files."
-    )
-    language: Language = Field(
-        default=Language.ENGLISH, description="The language of the text to parse."
-    )
-    parsing_instruction: Optional[str] = Field(
-        default="", description="The parsing instruction for the parser."
-    )
-    skip_diagonal_text: Optional[bool] = Field(
-        default=False,
-        description="If set to true, the parser will ignore diagonal text (when the text rotation in degrees modulo 90 is not 0).",
-    )
-    invalidate_cache: Optional[bool] = Field(
-        default=False,
-        description="If set to true, the cache will be ignored and the document re-processed. All documents are kept in the cache for 48 hours after a job completes, to avoid processing the same document twice.",
-    )
-    do_not_cache: Optional[bool] = Field(
-        default=False,
-        description="If set to true, the document will not be cached. This means you will be re-charged if you reprocess it, as it will not be in the cache.",
-    )
-    fast_mode: Optional[bool] = Field(
-        default=False,
-        description="Note: Not compatible with gpt-4o. If set to true, the parser will use a faster mode to extract text from documents. This mode skips OCR of images and table/heading reconstruction.",
-    )
-    premium_mode: bool = Field(
-        default=False,
-        description="Use our best parser mode if set to True.",
-    )
-    continuous_mode: bool = Field(
-        default=False,
-        description="Parse documents continuously, leading to better results on documents where tables span across two pages.",
-    )
-    do_not_unroll_columns: Optional[bool] = Field(
-        default=False,
-        description="If set to true, the parser will keep columns in the text according to the document layout. This reduces reconstruction accuracy, and LLM/embedding performance, in most cases.",
-    )
-    page_separator: Optional[str] = Field(
-        default=None,
-        description="A templated page separator to use to split the text. If it contains `{page_number}`, it will be replaced by the next page number. If not set, the default separator '\\n---\\n' will be used.",
-    )
-    page_prefix: Optional[str] = Field(
-        default=None,
-        description="A templated prefix to add to the beginning of each page. If it contains `{page_number}`, it will be replaced by the page number.",
-    )
-    page_suffix: Optional[str] = Field(
-        default=None,
-        description="A templated suffix to add to the end of each page. If it contains `{page_number}`, it will be replaced by the page number.",
-    )
-    gpt4o_mode: bool = Field(
-        default=False,
-        description="Whether to use gpt-4o to extract text from documents.",
-    )
-    gpt4o_api_key: Optional[str] = Field(
-        default=None,
-        description="The API key for the GPT-4o API. Lowers the cost of parsing.",
-    )
-    bounding_box: Optional[str] = Field(
-        default=None,
-        description="The bounding box to use to extract text from documents, described as a string containing the bounding box margins.",
-    )
-    target_pages: Optional[str] = Field(
-        default=None,
-        description="The target pages to extract text from, described as a comma-separated list of page numbers. The first page of the document is page 0.",
-    )
-    ignore_errors: bool = Field(
-        default=True,
-        description="Whether or not to ignore and skip errors raised during parsing.",
-    )
-    split_by_page: bool = Field(
-        default=True,
-        description="Whether to split by page using the page separator.",
-    )
-    vendor_multimodal_api_key: Optional[str] = Field(
-        default=None,
-        description="The API key for the multimodal API.",
-    )
-    use_vendor_multimodal_model: bool = Field(
-        default=False,
-        description="Whether to use the vendor multimodal API.",
-    )
-    vendor_multimodal_model_name: Optional[str] = Field(
-        default=None,
-        description="The model name for the vendor multimodal API.",
-    )
-    take_screenshot: bool = Field(
-        default=False,
-        description="Whether to take a screenshot of each page of the document.",
-    )
-    custom_client: Optional[httpx.AsyncClient] = Field(
-        default=None, description="A custom HTTPX client to use for sending requests."
-    )
-    disable_ocr: bool = Field(
-        default=False,
-        description="Disable OCR on the document. LlamaParse will only extract the copyable text from the document.",
-    )
-    is_formatting_instruction: bool = Field(
-        default=True,
-        description="Allow the parsing instruction to also format the output. Disable to get a cleaner markdown output.",
-    )
-    annotate_links: bool = Field(
-        default=False,
-        description="Annotate links found in the document to extract their URL.",
-    )
-    webhook_url: Optional[str] = Field(
-        default=None,
-        description="A URL that needs to be called at the end of the parsing job.",
-    )
-    azure_openai_deployment_name: Optional[str] = Field(
-        default=None, description="Azure OpenAI Deployment Name"
-    )
-    azure_openai_endpoint: Optional[str] = Field(
-        default=None, description="Azure OpenAI Endpoint"
-    )
-    azure_openai_api_version: Optional[str] = Field(
-        default=None, description="Azure OpenAI API Version"
-    )
-    azure_openai_key: Optional[str] = Field(
-        default=None, description="Azure OpenAI Key"
-    )
+class LlamaParseParams(BasePydanticReader):
+    """This class contains only the parameters that are used by the LlamaParse API."""
+
+    bounding_box: Optional[str] = Field(
+        default=None,
+        description="The bounding box to use to extract text from documents, described as a string containing the bounding box margins.",
+    )
+    continuous_mode: Optional[bool] = Field(
+        default=False,
+        description="If set to true, the parser will try to merge consecutive tables together.",
+    )
+    do_not_cache: Optional[bool] = Field(
+        default=False,
+        description="If set to true, the document will not be cached. This means you will be re-charged if you reprocess it, as it will not be in the cache.",
+    )
+    do_not_unroll_columns: Optional[bool] = Field(
+        default=False,
+        description="If set to true, the parser will keep columns in the text according to the document layout. This reduces reconstruction accuracy, and LLM/embedding performance, in most cases.",
+    )
+    fast_mode: Optional[bool] = Field(
+        default=False,
+        description="Note: Not compatible with other modes. If set to true, the parser will use a faster mode to extract text from documents. This mode skips OCR of images and table/heading reconstruction.",
+    )
+    gpt4o_api_key: Optional[str] = Field(
+        default=None,
+        description="(Deprecated, use vendor_multimodal_api_key instead.) The API key for the GPT-4o API. Lowers the cost of parsing.",
+    )
+    gpt4o_mode: bool = Field(
+        default=False,
+        description="(Deprecated, use vendor_multimodal_model_name='gpt-4o' instead.) Whether to use gpt-4o to extract text from documents.",
+    )
+    guess_xlsx_sheet_name: Optional[bool] = Field(
+        default=False,
+        description="Experimental: If set to true, when outputting to xlsx, the parser will try to guess the sheet name based on the context of the table.",
+    )
+    ignore_errors: bool = Field(
+        default=True,
+        description="Whether or not to ignore and skip errors raised during parsing.",
+    )
+    invalidate_cache: Optional[bool] = Field(
+        default=False,
+        description="If set to true, the cache will be ignored and the document re-processed. All documents are kept in the cache for 48 hours after a job completes, to avoid processing the same document twice.",
+    )
+    language: Language = Field(
+        default=Language.ENGLISH, description="The language of the text to parse."
+    )
+    page_prefix: Optional[str] = Field(
+        default=None,
+        description="A templated prefix to add to the beginning of each page. If it contains `{page_number}`, it will be replaced by the page number.",
+    )
+    page_separator: Optional[str] = Field(
+        default=None,
+        description="A templated page separator to use to split the text. If it contains `{page_number}`, it will be replaced by the next page number. If not set, the default separator '\\n---\\n' will be used.",
+    )
+    page_suffix: Optional[str] = Field(
+        default=None,
+        description="A templated suffix to add to the end of each page. If it contains `{page_number}`, it will be replaced by the page number.",
+    )
+    parsing_instruction: Optional[str] = Field(
+        default="", description="The parsing instruction for the parser."
+    )
+    skip_diagonal_text: Optional[bool] = Field(
+        default=False,
+        description="If set to true, the parser will ignore diagonal text (when the text rotation in degrees modulo 90 is not 0).",
+    )
+    split_by_page: bool = Field(
+        default=True,
+        description="Whether to split by page using the page separator.",
+    )
+    take_screenshot: bool = Field(
+        default=False,
+        description="Whether to take a screenshot of each page of the document.",
+    )
+    target_pages: Optional[str] = Field(
+        default=None,
+        description="The target pages to extract text from, described as a comma-separated list of page numbers. The first page of the document is page 0.",
+    )
+    use_vendor_multimodal_model: bool = Field(
+        default=False,
+        description="Whether to use the vendor multimodal API.",
+    )
+    vendor_multimodal_api_key: Optional[str] = Field(
+        default=None,
+        description="The API key for the multimodal API.",
+    )
+    vendor_multimodal_model_name: Optional[str] = Field(
+        default=None,
+        description="The model name for the vendor multimodal API.",
+    )
+
+
+class LlamaParse(LlamaParseParams):
+    """A smart-parser for files."""
+
+    # Package parameters
+    api_key: str = Field(
+        default="",
+        description="The API key for the LlamaParse API.",
+        validate_default=True,
+    )
+    base_url: str = Field(
+        default=DEFAULT_BASE_URL,
+        description="The base URL of the Llama Parsing API.",
+    )
+    check_interval: int = Field(
+        default=1,
+        description="The interval in seconds to check if the parsing is done.",
+    )
+    custom_client: Optional[httpx.AsyncClient] = Field(
+        default=None, description="A custom HTTPX client to use for sending requests."
+    )
+    num_workers: int = Field(
+        default=4,
+        gt=0,
+        lt=10,
+        description="The number of workers to use when sending API requests for parsing.",
+    )
+    max_timeout: int = Field(
+        default=2000,
+        description="The maximum timeout in seconds to wait for the parsing to finish.",
+    )
+    result_type: ResultType = Field(
+        default=ResultType.TXT, description="The result type for the parser."
+    )
+    show_progress: bool = Field(
+        default=True, description="Show progress when parsing multiple files."
+    )
+    verbose: bool = Field(
+        default=True, description="Whether to print the progress of the parsing."
+    )

    @field_validator("api_key", mode="before", check_fields=True)
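For context on how the split is meant to be consumed: everything declared on LlamaParseParams is forwarded to the API, while the fields declared on LlamaParse itself stay client-side. Below is a minimal usage sketch; it assumes the package's existing public interface (llama_parse exporting LlamaParse, with a load_data method), and all parameter values are illustrative, not recommendations.

from llama_parse import LlamaParse

parser = LlamaParse(
    api_key="llx-...",       # client-side field, used for authentication
    result_type="markdown",  # client-side: controls how results are returned
    # API-side parameters, declared on LlamaParseParams:
    language="en",
    page_separator="\n=== page {page_number} ===\n",  # `{page_number}` is templated
    skip_diagonal_text=True,
)

documents = parser.load_data("./my_file.pdf")
print(documents[0].text[:200])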
@@ -256,59 +236,18 @@ async def _create_job(
                "file_input must be either a file path string, file bytes, or buffer object"
            )

-        data = {
-            "language": self.language.value,
-            "parsing_instruction": self.parsing_instruction,
-            "invalidate_cache": self.invalidate_cache,
-            "skip_diagonal_text": self.skip_diagonal_text,
-            "do_not_cache": self.do_not_cache,
-            "fast_mode": self.fast_mode,
-            "premium_mode": self.premium_mode,
-            "continuous_mode": self.continuous_mode,
-            "do_not_unroll_columns": self.do_not_unroll_columns,
-            "gpt4o_mode": self.gpt4o_mode,
-            "gpt4o_api_key": self.gpt4o_api_key,
-            "vendor_multimodal_api_key": self.vendor_multimodal_api_key,
-            "use_vendor_multimodal_model": self.use_vendor_multimodal_model,
-            "vendor_multimodal_model_name": self.vendor_multimodal_model_name,
-            "take_screenshot": self.take_screenshot,
-            "disable_ocr": self.disable_ocr,
-            "is_formatting_instruction": self.is_formatting_instruction,
-            "annotate_links": self.annotate_links,
-        }
-
-        # Only send the page separator to the server if it is not None: if a null
-        # or empty string is sent, the server will ignore the page separator
-        # instead of using the default.
-        if self.page_separator is not None:
-            data["page_separator"] = self.page_separator
-
-        if self.page_prefix is not None:
-            data["page_prefix"] = self.page_prefix
-
-        if self.page_suffix is not None:
-            data["page_suffix"] = self.page_suffix
-
-        if self.bounding_box is not None:
-            data["bounding_box"] = self.bounding_box
-
-        if self.target_pages is not None:
-            data["target_pages"] = self.target_pages
-
-        if self.webhook_url is not None:
-            data["webhook_url"] = self.webhook_url
-
-        # Azure OpenAI
-        if self.azure_openai_deployment_name is not None:
-            data["azure_openai_deployment_name"] = self.azure_openai_deployment_name
-
-        if self.azure_openai_endpoint is not None:
-            data["azure_openai_endpoint"] = self.azure_openai_endpoint
-
-        if self.azure_openai_api_version is not None:
-            data["azure_openai_api_version"] = self.azure_openai_api_version
-
-        if self.azure_openai_key is not None:
-            data["azure_openai_key"] = self.azure_openai_key
+        data = {}
+
+        # For each parameter declared on LlamaParseParams, send it to the API
+        # unless it is unset (None).
+        llama_keys = LlamaParseParams.__annotations__.keys()
+
+        for key in llama_keys:
+            value = getattr(self, key)
+            if value is not None:
+                # Enum fields (e.g. `language`) must be sent as their string value;
+                # this assumes `from enum import Enum` at the top of the module.
+                data[key] = value.value if isinstance(value, Enum) else value

        # To track that the job was created from the Python client and better handle bugs
        data["from_python_client"] = True

        try:
            async with self.client_context() as client:
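The loop above replaces the long hand-written payload dict: any field declared directly on LlamaParseParams is sent unless it is None. Two consequences worth noting: class-level __annotations__ does not include inherited attributes, so client-side fields such as api_key or custom_client never leak into the payload; and False is still sent (only None is skipped). The helper below is hypothetical, not part of this PR; it rebuilds the payload the same way so a reviewer can inspect what would be sent.

from enum import Enum

def build_payload(parser: LlamaParse) -> dict:
    # Mirrors the loop in _create_job (sketch; assumes the Enum handling above).
    data = {}
    for key in LlamaParseParams.__annotations__:
        value = getattr(parser, key)
        if value is not None:
            # Enum fields (e.g. Language.ENGLISH) are sent as their string value.
            data[key] = value.value if isinstance(value, Enum) else value
    data["from_python_client"] = True
    return data

print(build_payload(LlamaParse(api_key="llx-...", language="en")))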
@@ -360,8 +299,7 @@ async def _get_job_result(
                    continue

                # Allowed values: "PENDING", "SUCCESS", "ERROR", "CANCELED"
-                result_json = result.json()
-                status = result_json["status"]
+                status = result.json()["status"]
                if status == "SUCCESS":
                    parsed_result = await client.get(result_url, headers=headers)
                    return parsed_result.json()
@@ -373,14 +311,6 @@
                        print(".", end="", flush=True)

                    await asyncio.sleep(self.check_interval)
-                else:
-                    error_code = result_json.get("error_code", "No error code found")
-                    error_message = result_json.get(
-                        "error_message", "No error message found"
-                    )
-
-                    exception_str = f"Job ID: {job_id} failed with status: {status}, Error code: {error_code}, Error message: {error_message}"
-                    raise Exception(exception_str)
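With this else branch removed, a job that ends in ERROR or CANCELED no longer raises immediately: the loop keeps polling until max_timeout elapses. A compressed sketch of the resulting control flow (the loop condition, elapsed-time bookkeeping, and status_url name are assumed from surrounding code not shown in this diff):

elapsed = 0.0  # assumed bookkeeping, not shown in the hunk
while elapsed < self.max_timeout:
    status = (await client.get(status_url, headers=headers)).json()["status"]
    if status == "SUCCESS":
        parsed_result = await client.get(result_url, headers=headers)
        return parsed_result.json()
    if status == "PENDING" and self.verbose:
        print(".", end="", flush=True)
    # ERROR / CANCELED now fall through silently until the timeout is reached.
    await asyncio.sleep(self.check_interval)
    elapsed += self.check_interval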

    async def _aload_data(
        self,
@@ -425,7 +355,7 @@ async def aload_data(
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        """Load data from the input path."""
-        if isinstance(file_path, (str, PurePosixPath, Path, bytes, BufferedIOBase)):
+        if isinstance(file_path, (str, Path, bytes, BufferedIOBase)):
            return await self._aload_data(
                file_path, extra_info=extra_info, fs=fs, verbose=self.verbose
            )