From 388be7da664441f521c50b679352955571bf790b Mon Sep 17 00:00:00 2001
From: Pierre-Loic Doulcet
Date: Mon, 28 Oct 2024 17:35:40 +0100
Subject: [PATCH 1/4] add params

---
 llama_parse/base.py  | 253 +++++++++++++++----------------------------
 llama_parse/utils.py |   1 +
 2 files changed, 89 insertions(+), 165 deletions(-)

diff --git a/llama_parse/base.py b/llama_parse/base.py
index 4bdbf46..44a815e 100644
--- a/llama_parse/base.py
+++ b/llama_parse/base.py
@@ -34,159 +34,136 @@
 _DEFAULT_SEPARATOR = "\n---\n"
 
 
-class LlamaParse(BasePydanticReader):
-    """A smart-parser for files."""
-
-    api_key: str = Field(
-        default="",
-        description="The API key for the LlamaParse API.",
-        validate_default=True,
-    )
-    base_url: str = Field(
-        default=DEFAULT_BASE_URL,
-        description="The base URL of the Llama Parsing API.",
-    )
-    result_type: ResultType = Field(
-        default=ResultType.TXT, description="The result type for the parser."
-    )
-    num_workers: int = Field(
-        default=4,
-        gt=0,
-        lt=10,
-        description="The number of workers to use sending API requests for parsing.",
-    )
-    check_interval: int = Field(
-        default=1,
-        description="The interval in seconds to check if the parsing is done.",
-    )
-    max_timeout: int = Field(
-        default=2000,
-        description="The maximum timeout in seconds to wait for the parsing to finish.",
-    )
-    verbose: bool = Field(
-        default=True, description="Whether to print the progress of the parsing."
-    )
-    show_progress: bool = Field(
-        default=True, description="Show progress when parsing multiple files."
-    )
-    language: Language = Field(
-        default=Language.ENGLISH, description="The language of the text to parse."
-    )
-    parsing_instruction: Optional[str] = Field(
-        default="", description="The parsing instruction for the parser."
+class LlamaParseParams(BasePydanticReader):
+    bounding_box: Optional[str] = Field(
+        default=None,
+        description="The bounding box to use to extract text from documents, described as a string containing the bounding box margins.",
     )
-    skip_diagonal_text: Optional[bool] = Field(
+
+    continuous_mode: Optional[bool] = Field(
         default=False,
-        description="If set to true, the parser will ignore diagonal text (when the text rotation in degrees modulo 90 is not 0).",
-    )
-    invalidate_cache: Optional[bool] = Field(
-        default=False,
-        description="If set to true, the cache will be ignored and the document re-processes. All document are kept in cache for 48hours after the job was completed to avoid processing the same document twice.",
+        description="If set to true, the parser will try to merge together tables that continue from one page to the next.",
     )
     do_not_cache: Optional[bool] = Field(
         default=False,
        description="If set to true, the document will not be cached. This means that you will be re-charged if you reprocess it, as it will not be cached.",
     )
+    do_not_unroll_columns: Optional[bool] = Field(
+        default=False,
+        description="If set to true, the parser will keep columns in the text according to the document layout. This reduces reconstruction accuracy, and LLM/embedding performance, in most cases.",
+    )
     fast_mode: Optional[bool] = Field(
         default=False,
-        description="Note: Non compatible with gpt-4o. If set to true, the parser will use a faster mode to extract text from documents. This mode will skip OCR of images, and table/heading reconstruction.",
+        description="Note: not compatible with other modes. If set to true, the parser will use a faster mode to extract text from documents. This mode will skip OCR of images, and table/heading reconstruction.",
     )
-    premium_mode: bool = Field(
+    gpt4o_api_key: Optional[str] = Field(
+        default=None,
+        description="(deprecated, use vendor_multimodal_api_key instead). The API key for the GPT-4o API. Lowers the cost of parsing.",
+    )
+    gpt4o_mode: bool = Field(
         default=False,
-        description="Use our best parser mode if set to True.",
+        description="(deprecated, use vendor_multimodal_model_name='gpt-4o' instead). Whether to use gpt-4o to extract text from documents.",
     )
-    continuous_mode: bool = Field(
+    guess_xlsx_sheet_name: Optional[bool] = Field(
         default=False,
-        description="Parse documents continuously, leading to better results on documents where tables span across two pages.",
+        description="Experimental: If set to true, when outputting to xlsx, the parser will try to guess the sheet name based on the context of the table.",
     )
-    do_not_unroll_columns: Optional[bool] = Field(
+    ignore_errors: bool = Field(
+        default=True,
+        description="Whether or not to ignore and skip errors raised during parsing.",
+    )
+    invalidate_cache: Optional[bool] = Field(
         default=False,
-        description="If set to true, the parser will keep column in the text according to document layout. Reduce reconstruction accuracy, and LLM's/embedings performances in most case.",
+        description="If set to true, the cache will be ignored and the document re-processed. All documents are kept in cache for 48 hours after the job is completed, to avoid processing the same document twice.",
     )
-    page_separator: Optional[str] = Field(
-        default=None,
-        description="A templated page separator to use to split the text. If it contain `{page_number}`,it will be replaced by the next page number. If not set will the default separator '\\n---\\n' will be used.",
+    language: Language = Field(
+        default=Language.ENGLISH, description="The language of the text to parse."
     )
     page_prefix: Optional[str] = Field(
         default=None,
-        description="A templated prefix to add to the beginning of each page. If it contain `{page_number}`, it will be replaced by the page number.",
+        description="A templated prefix to add to the beginning of each page. If it contains `{page_number}`, it will be replaced by the page number.",
     )
+    page_separator: Optional[str] = Field(
+        default=None,
+        description="A templated page separator to use to split the text. If it contains `{page_number}`, it will be replaced by the next page number. If not set, the default separator '\\n---\\n' will be used.",
+    )
     page_suffix: Optional[str] = Field(
         default=None,
-        description="A templated suffix to add to the beginning of each page. If it contain `{page_number}`, it will be replaced by the page number.",
+        description="A templated suffix to add to the end of each page. If it contains `{page_number}`, it will be replaced by the page number.",
     )
-    gpt4o_mode: bool = Field(
+    parsing_instruction: Optional[str] = Field(
+        default="", description="The parsing instruction for the parser."
+    )
+    skip_diagonal_text: Optional[bool] = Field(
         default=False,
-        description="Whether to use gpt-4o extract text from documents.",
+        description="If set to true, the parser will ignore diagonal text (when the text rotation in degrees modulo 90 is not 0).",
     )
-    gpt4o_api_key: Optional[str] = Field(
-        default=None,
-        description="The API key for the GPT-4o API. Lowers the cost of parsing.",
+    split_by_page: bool = Field(
+        default=True,
+        description="Whether to split by page using the page separator.",
     )
-    bounding_box: Optional[str] = Field(
-        default=None,
-        description="The bounding box to use to extract text from documents describe as a string containing the bounding box margins",
+    take_screenshot: bool = Field(
+        default=False,
+        description="Whether to take a screenshot of each page of the document.",
     )
     target_pages: Optional[str] = Field(
         default=None,
         description="The target pages to extract text from documents. Described as a comma-separated list of page numbers. The first page of the document is page 0.",
     )
-    ignore_errors: bool = Field(
-        default=True,
-        description="Whether or not to ignore and skip errors raised during parsing.",
-    )
-    split_by_page: bool = Field(
-        default=True,
-        description="Whether to split by page using the page separator",
+    use_vendor_multimodal_model: bool = Field(
+        default=False,
+        description="Whether to use the vendor multimodal API.",
     )
     vendor_multimodal_api_key: Optional[str] = Field(
         default=None,
         description="The API key for the multimodal API.",
     )
-    use_vendor_multimodal_model: bool = Field(
-        default=False,
-        description="Whether to use the vendor multimodal API.",
-    )
+
     vendor_multimodal_model_name: Optional[str] = Field(
         default=None,
         description="The model name for the vendor multimodal API.",
     )
-    take_screenshot: bool = Field(
-        default=False,
-        description="Whether to take screenshot of each page of the document.",
-    )
-    custom_client: Optional[httpx.AsyncClient] = Field(
-        default=None, description="A custom HTTPX client to use for sending requests."
+
+class LlamaParse(LlamaParseParams):
+    """A smart-parser for files."""
+
+    # Package parameters
+    api_key: str = Field(
+        default="",
+        description="The API key for the LlamaParse API.",
+        validate_default=True,
     )
-    disable_ocr: bool = Field(
-        default=False,
-        description="Disable the OCR on the document. LlamaParse will only extract the copyable text from the document.",
+    base_url: str = Field(
+        default=DEFAULT_BASE_URL,
+        description="The base URL of the Llama Parsing API.",
     )
-    is_formatting_instruction: bool = Field(
-        default=True,
-        description="Allow the parsing instruction to also format the output. Disable to have a cleaner markdown output.",
+    check_interval: int = Field(
+        default=1,
+        description="The interval in seconds to check if the parsing is done.",
     )
-    annotate_links: bool = Field(
-        default=False,
-        description="Annotate links found in the document to extract their URL.",
+    custom_client: Optional[httpx.AsyncClient] = Field(
+        default=None, description="A custom HTTPX client to use for sending requests."
     )
-    webhook_url: Optional[str] = Field(
-        default=None,
-        description="A URL that needs to be called at the end of the parsing job.",
+    num_workers: int = Field(
+        default=4,
+        gt=0,
+        lt=10,
+        description="The number of workers to use when sending API requests for parsing.",
     )
-    azure_openai_deployment_name: Optional[str] = Field(
-        default=None, description="Azure Openai Deployment Name"
+    max_timeout: int = Field(
+        default=2000,
+        description="The maximum timeout in seconds to wait for the parsing to finish.",
     )
-    azure_openai_endpoint: Optional[str] = Field(
-        default=None, description="Azure Openai Endpoint"
+    result_type: ResultType = Field(
+        default=ResultType.TXT, description="The result type for the parser."
     )
-    azure_openai_api_version: Optional[str] = Field(
-        default=None, description="Azure Openai API Version"
+    show_progress: bool = Field(
+        default=True, description="Show progress when parsing multiple files."
     )
-    azure_openai_key: Optional[str] = Field(
-        default=None, description="Azure Openai Key"
+    verbose: bool = Field(
+        default=True, description="Whether to print the progress of the parsing."
     )
-
+    
     @field_validator("api_key", mode="before", check_fields=True)
     @classmethod
     def validate_api_key(cls, v: str) -> str:
@@ -257,58 +234,13 @@ async def _create_job(
         )
 
         data = {
-            "language": self.language.value,
-            "parsing_instruction": self.parsing_instruction,
-            "invalidate_cache": self.invalidate_cache,
-            "skip_diagonal_text": self.skip_diagonal_text,
-            "do_not_cache": self.do_not_cache,
-            "fast_mode": self.fast_mode,
-            "premium_mode": self.premium_mode,
-            "continuous_mode": self.continuous_mode,
-            "do_not_unroll_columns": self.do_not_unroll_columns,
-            "gpt4o_mode": self.gpt4o_mode,
-            "gpt4o_api_key": self.gpt4o_api_key,
-            "vendor_multimodal_api_key": self.vendor_multimodal_api_key,
-            "use_vendor_multimodal_model": self.use_vendor_multimodal_model,
-            "vendor_multimodal_model_name": self.vendor_multimodal_model_name,
-            "take_screenshot": self.take_screenshot,
-            "disable_ocr": self.disable_ocr,
-            "is_formatting_instruction": self.is_formatting_instruction,
-            "annotate_links": self.annotate_links,
-        }
-
-        # only send page separator to server if it is not None
-        # as if a null, "" string is sent the server will then ignore the page separator instead of using the default
-        if self.page_separator is not None:
-            data["page_separator"] = self.page_separator
-
-        if self.page_prefix is not None:
-            data["page_prefix"] = self.page_prefix
-
-        if self.page_suffix is not None:
-            data["page_suffix"] = self.page_suffix
-
-        if self.bounding_box is not None:
-            data["bounding_box"] = self.bounding_box
-
-        if self.target_pages is not None:
-            data["target_pages"] = self.target_pages
-
-        if self.webhook_url is not None:
-            data["webhook_url"] = self.webhook_url
-
-        # Azure OpenAI
-        if self.azure_openai_deployment_name is not None:
-            data["azure_openai_deployment_name"] = self.azure_openai_deployment_name
-
-        if self.azure_openai_endpoint is not None:
-            data["azure_openai_endpoint"] = self.azure_openai_endpoint
-
-        if self.azure_openai_api_version is not None:
-            data["azure_openai_api_version"] = self.azure_openai_api_version
-
-        if self.azure_openai_key is not None:
-            data["azure_openai_key"] = self.azure_openai_key
+        }
+
+        # for each key of LlamaParseParams
+        # if the value is not None, add it to the data
+        for key in LlamaParseParams:
+            if getattr(self, key) is not None:
+                data[key] = getattr(self, key)
 
         try:
             async with self.client_context() as client:
                 response = await client.post(
@@ -360,8 +292,7 @@ async def _get_job_result(
                 continue
 
             # Allowed values "PENDING", "SUCCESS", "ERROR", "CANCELED"
-            result_json = result.json()
-            status = result_json["status"]
+            status = result.json()["status"]
             if status == "SUCCESS":
                 parsed_result = await client.get(result_url, headers=headers)
                 return parsed_result.json()
@@ -373,14 +304,6 @@
                     print(".", end="", flush=True)
 
                 await asyncio.sleep(self.check_interval)
-            else:
-                error_code = result_json.get("error_code", "No error code found")
-                error_message = result_json.get(
-                    "error_message", "No error message found"
-                )
-
-                exception_str = f"Job ID: {job_id} failed with status: {status}, Error code: {error_code}, Error message: {error_message}"
-                raise Exception(exception_str)
 
     async def _aload_data(
         self,
@@ -425,7 +348,7 @@ async def aload_data(
         fs: Optional[AbstractFileSystem] = None,
     ) -> List[Document]:
         """Load data from the input path."""
-        if isinstance(file_path, (str, PurePosixPath, Path, bytes, BufferedIOBase)):
+        if isinstance(file_path, (str, Path, bytes, BufferedIOBase)):
             return await self._aload_data(
                 file_path, extra_info=extra_info, fs=fs, verbose=self.verbose
             )
diff --git a/llama_parse/utils.py b/llama_parse/utils.py
index 8a2180e..c042a46 100644
--- a/llama_parse/utils.py
+++ b/llama_parse/utils.py
@@ -10,6 +10,7 @@ class ResultType(str, Enum):
 
     TXT = "text"
     MD = "markdown"
+    XLSX = "xlsx"
 
 
 class Language(str, Enum):

From 2451a74095ec53e0d2c621c9352d43f4e1113535 Mon Sep 17 00:00:00 2001
From: Pierre-Loic Doulcet
Date: Mon, 28 Oct 2024 22:17:42 +0100
Subject: [PATCH 2/4] new params / refactor

---
 llama_parse/utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llama_parse/utils.py b/llama_parse/utils.py
index c042a46..8a2180e 100644
--- a/llama_parse/utils.py
+++ b/llama_parse/utils.py
@@ -10,7 +10,6 @@ class ResultType(str, Enum):
 
     TXT = "text"
     MD = "markdown"
-    XLSX = "xlsx"
 
 
 class Language(str, Enum):

From 23f562aae1225ea2669e4d6f146f0e248294344d Mon Sep 17 00:00:00 2001
From: Pierre-Loic Doulcet
Date: Fri, 1 Nov 2024 16:21:21 +0100
Subject: [PATCH 3/4] add xlsx

---
 llama_parse/base.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/llama_parse/base.py b/llama_parse/base.py
index 44a815e..7cf6025 100644
--- a/llama_parse/base.py
+++ b/llama_parse/base.py
@@ -39,7 +39,7 @@ class LlamaParseParams(BasePydanticReader):
         default=None,
         description="The bounding box to use to extract text from documents, described as a string containing the bounding box margins.",
     )
-    
+
     continuous_mode: Optional[bool] = Field(
         default=False,
         description="If set to true, the parser will try to merge together tables that continue from one page to the next.",
     )
@@ -118,12 +118,13 @@ class LlamaParseParams(BasePydanticReader):
         default=None,
         description="The API key for the multimodal API.",
     )
-    
+
     vendor_multimodal_model_name: Optional[str] = Field(
         default=None,
         description="The model name for the vendor multimodal API.",
     )
 
+
 class LlamaParse(LlamaParseParams):
     """A smart-parser for files."""
@@ -163,7 +164,7 @@ class LlamaParse(LlamaParseParams):
     verbose: bool = Field(
         default=True, description="Whether to print the progress of the parsing."
     )
-    
+
     @field_validator("api_key", mode="before", check_fields=True)
     @classmethod
     def validate_api_key(cls, v: str) -> str:
@@ -233,12 +234,13 @@ async def _create_job(
             "file_input must be either a file path string, file bytes, or buffer object"
         )
 
-        data = {
-        }
+        data = {}
 
         # for each key of LlamaParseParams
         # if the value is not None, add it to the data
-        for key in LlamaParseParams:
+        llama_keys = LlamaParseParams.__annotations__.keys()
+
+        for key in llama_keys:
             if getattr(self, key) is not None:
                 data[key] = getattr(self, key)

From 47d98152c3694e3738f1b87e6672c143359400ef Mon Sep 17 00:00:00 2001
From: Pierre-Loic Doulcet
Date: Fri, 1 Nov 2024 16:23:42 +0100
Subject: [PATCH 4/4] add comment

---
 llama_parse/base.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/llama_parse/base.py b/llama_parse/base.py
index 7cf6025..606586c 100644
--- a/llama_parse/base.py
+++ b/llama_parse/base.py
@@ -35,6 +35,8 @@
 
 class LlamaParseParams(BasePydanticReader):
+    """This class contains only the parameters that are used by the LlamaParse API."""
+
     bounding_box: Optional[str] = Field(
         default=None,
         description="The bounding box to use to extract text from documents, described as a string containing the bounding box margins.",
     )
@@ -244,6 +246,9 @@ async def _create_job(
             if getattr(self, key) is not None:
                 data[key] = getattr(self, key)
 
+        # Track that the job was created from the Python client, to better handle bugs
+        data["from_python_client"] = True
+
         try:
             async with self.client_context() as client:
                 response = await client.post(
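
Reviewer note (illustrative, not part of the patches): the core idea of the refactor is that any field declared on LlamaParseParams is forwarded to the API automatically, while package-level settings declared on the LlamaParse subclass stay local. Below is a minimal, runnable sketch of that pattern. The class names, fields, and the "llx-" key are stand-ins, not the real reader classes.

    from typing import Optional

    from pydantic import BaseModel, Field


    # Stand-in for LlamaParseParams: only API-bound fields live here.
    class Params(BaseModel):
        bounding_box: Optional[str] = Field(default=None)
        continuous_mode: Optional[bool] = Field(default=False)
        do_not_cache: Optional[bool] = Field(default=False)


    # Stand-in for LlamaParse: package-level settings live on the subclass.
    class Client(Params):
        api_key: str = Field(default="")
        num_workers: int = Field(default=4)


    client = Client(api_key="llx-placeholder", continuous_mode=True)

    # Mirrors the loop in _create_job: __annotations__ on the base class lists
    # only the fields declared there, so package settings (api_key, num_workers)
    # never leak into the request payload, and None values are skipped.
    data = {
        key: getattr(client, key)
        for key in Params.__annotations__.keys()
        if getattr(client, key) is not None
    }
    print(data)  # {'continuous_mode': True, 'do_not_cache': False}

One consequence of this design: adding a new API parameter only requires a field declaration on LlamaParseParams, with no change to _create_job.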